diff --git a/Analyzer/Analyzer.cpp b/Analyzer/Analyzer.cpp index a6d3bed182..9d0b6b0cc9 100644 --- a/Analyzer/Analyzer.cpp +++ b/Analyzer/Analyzer.cpp @@ -3107,24 +3107,27 @@ std::string OffsetInFragment::toString() const { } std::string WindowFrame::toString() const { + std::ostringstream oss; auto bound_str = bound_expr_ ? bound_expr_->toString() : "None"; - return ::toString(bound_type_) + " " + bound_str; + oss << bound_type_ << " " << bound_str; + return oss.str(); } std::string WindowFunction::toString() const { - std::string result = "WindowFunction(" + ::toString(kind_); + std::ostringstream oss; + oss << "WindowFunction(" << kind_; for (const auto& arg : args_) { - result += " " + arg->toString(); + oss << " " << arg->toString(); } if (hasFraming()) { - result += " Frame{"; + oss << " Frame{"; switch (frame_bound_type_) { case FrameBoundType::ROW: { - result += "ROW"; + oss << "ROW"; break; } case FrameBoundType::RANGE: { - result += "RANGE"; + oss << "RANGE"; break; } default: { @@ -3133,17 +3136,17 @@ std::string WindowFunction::toString() const { break; } } - result += " BETWEEN : " + frame_start_bound_->toString(); - result += " AND : " + frame_end_bound_->toString(); + oss << " BETWEEN : " + frame_start_bound_->toString(); + oss << " AND : " + frame_end_bound_->toString(); } else { if (!order_keys_.empty()) { - result += " (RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)"; + oss << " (RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)"; } else { - result += " (RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)"; + oss << " (RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)"; } } - result += "} "; - return result + ") "; + oss << "} )"; + return oss.str(); } std::string ArrayExpr::toString() const { @@ -3763,6 +3766,11 @@ std::shared_ptr RegexpSubstrStringOper::deep_copy() const { std::dynamic_pointer_cast(StringOper::deep_copy())); } +std::shared_ptr RegexpCountStringOper::deep_copy() const { + return makeExpr( + 
std::dynamic_pointer_cast(StringOper::deep_copy())); +} + std::shared_ptr JsonValueStringOper::deep_copy() const { return makeExpr( std::dynamic_pointer_cast(StringOper::deep_copy())); @@ -3778,6 +3786,16 @@ std::shared_ptr Base64DecodeStringOper::deep_copy() const { std::dynamic_pointer_cast(StringOper::deep_copy())); } +std::shared_ptr UrlEncodeStringOper::deep_copy() const { + return makeExpr( + std::dynamic_pointer_cast(StringOper::deep_copy())); +} + +std::shared_ptr UrlDecodeStringOper::deep_copy() const { + return makeExpr( + std::dynamic_pointer_cast(StringOper::deep_copy())); +} + std::shared_ptr TryStringCastOper::deep_copy() const { return makeExpr( std::dynamic_pointer_cast(StringOper::deep_copy())); @@ -3798,6 +3816,11 @@ std::shared_ptr LevenshteinDistanceStringOper::deep_copy() const std::dynamic_pointer_cast(StringOper::deep_copy())); } +std::shared_ptr HashStringOper::deep_copy() const { + return makeExpr( + std::dynamic_pointer_cast(StringOper::deep_copy())); +} + std::shared_ptr FunctionOper::deep_copy() const { std::vector> args_copy; for (size_t i = 0; i < getArity(); ++i) { @@ -4075,13 +4098,19 @@ std::shared_ptr GeoConstant::makePhysicalConstant( std::vector poly_rings; const bool validate_with_geos_if_available = false; - Geospatial::GeoTypesFactory::getGeoColumns(geo_->getWktString(), - ti, - coords, - bounds, - ring_sizes, - poly_rings, - validate_with_geos_if_available); + bool success = + Geospatial::GeoTypesFactory::getGeoColumns(geo_->getWktString(), + ti, + coords, + bounds, + ring_sizes, + poly_rings, + validate_with_geos_if_available); + if (!success) { + std::ostringstream oss; + oss << "Failed to create geometry from WKT string: " << geo_->getWktString(); + throw std::runtime_error(oss.str()); + } switch (index) { case 0: // coords diff --git a/Analyzer/Analyzer.h b/Analyzer/Analyzer.h index 27237fe7f1..af1f2b836e 100644 --- a/Analyzer/Analyzer.h +++ b/Analyzer/Analyzer.h @@ -2359,6 +2359,46 @@ class RegexpSubstrStringOper : 
public StringOper { "sub-match group index"}; } }; + +class RegexpCountStringOper : public StringOper { + public: + RegexpCountStringOper(const std::shared_ptr& operand, + const std::shared_ptr& regex_pattern, + const std::shared_ptr& start_pos, + const std::shared_ptr& regex_params) + : StringOper(SqlStringOpKind::REGEXP_COUNT, + SQLTypeInfo(kBIGINT), + {operand, regex_pattern, start_pos, regex_params}, + getMinArgs(), + getExpectedTypeFamilies(), + getArgNames()) {} + + RegexpCountStringOper(const std::vector>& operands) + : StringOper(SqlStringOpKind::REGEXP_COUNT, + SQLTypeInfo(kBIGINT), + operands, + getMinArgs(), + getExpectedTypeFamilies(), + getArgNames()) {} + + RegexpCountStringOper(const std::shared_ptr& string_oper) + : StringOper(string_oper) {} + + std::shared_ptr deep_copy() const override; + + size_t getMinArgs() const override { return 4UL; } + + std::vector getExpectedTypeFamilies() const override { + return {OperandTypeFamily::STRING_FAMILY, + OperandTypeFamily::STRING_FAMILY, + OperandTypeFamily::INT_FAMILY, + OperandTypeFamily::STRING_FAMILY}; + } + std::vector getArgNames() const override { + return {"operand", "regex pattern", "start position", "regex parameters"}; + } +}; + class JsonValueStringOper : public StringOper { public: JsonValueStringOper(const std::shared_ptr& operand, @@ -2449,6 +2489,66 @@ class Base64DecodeStringOper : public StringOper { std::vector getArgNames() const override { return {"operand"}; } }; +class UrlEncodeStringOper : public StringOper { + public: + UrlEncodeStringOper(const std::shared_ptr& operand) + : StringOper(SqlStringOpKind::URL_ENCODE, + {operand}, + getMinArgs(), + getExpectedTypeFamilies(), + getArgNames()) {} + + UrlEncodeStringOper(const std::vector>& operands) + : StringOper(SqlStringOpKind::URL_ENCODE, + operands, + getMinArgs(), + getExpectedTypeFamilies(), + getArgNames()) {} + + UrlEncodeStringOper(const std::shared_ptr& string_oper) + : StringOper(string_oper) {} + + std::shared_ptr deep_copy() 
const override; + + size_t getMinArgs() const override { return 1u; } + + std::vector getExpectedTypeFamilies() const override { + return {OperandTypeFamily::STRING_FAMILY}; + } + + std::vector getArgNames() const override { return {"operand"}; } +}; + +class UrlDecodeStringOper : public StringOper { + public: + UrlDecodeStringOper(const std::shared_ptr& operand) + : StringOper(SqlStringOpKind::URL_DECODE, + {operand}, + getMinArgs(), + getExpectedTypeFamilies(), + getArgNames()) {} + + UrlDecodeStringOper(const std::vector>& operands) + : StringOper(SqlStringOpKind::URL_DECODE, + operands, + getMinArgs(), + getExpectedTypeFamilies(), + getArgNames()) {} + + UrlDecodeStringOper(const std::shared_ptr& string_oper) + : StringOper(string_oper) {} + + std::shared_ptr deep_copy() const override; + + size_t getMinArgs() const override { return 1u; } + + std::vector getExpectedTypeFamilies() const override { + return {OperandTypeFamily::STRING_FAMILY}; + } + + std::vector getArgNames() const override { return {"operand"}; } +}; + class TryStringCastOper : public StringOper { public: TryStringCastOper(const SQLTypeInfo& ti, const std::shared_ptr& operand) @@ -2603,6 +2703,37 @@ class LevenshteinDistanceStringOper : public StringOper { const std::vector>& operands); }; +class HashStringOper : public StringOper { + public: + HashStringOper(const std::shared_ptr& operand) + : StringOper(SqlStringOpKind::HASH, + SQLTypeInfo(kBIGINT), + {operand}, + getMinArgs(), + getExpectedTypeFamilies(), + getArgNames()) {} + + HashStringOper(const std::vector>& operands) + : StringOper(SqlStringOpKind::HASH, + SQLTypeInfo(kBIGINT), + operands, + getMinArgs(), + getExpectedTypeFamilies(), + getArgNames()) {} + + HashStringOper(const std::shared_ptr& string_oper) + : StringOper(string_oper) {} + + std::shared_ptr deep_copy() const override; + + size_t getMinArgs() const override { return 1UL; } + + std::vector getExpectedTypeFamilies() const override { + return 
{OperandTypeFamily::STRING_FAMILY}; + } + std::vector getArgNames() const override { return {"operand"}; } +}; + class FunctionOper : public Expr { public: FunctionOper(const SQLTypeInfo& ti, diff --git a/Benchmarks/conbench/report.css b/Benchmarks/conbench/report.css new file mode 100644 index 0000000000..2b23f2ad65 --- /dev/null +++ b/Benchmarks/conbench/report.css @@ -0,0 +1,9 @@ +body { font-family: sans-serif } +table { border-collapse: collapse } +th { text-align: right; padding-right: 1em } +td { font-family: monospace; text-align: right; padding-right: 1em } +td.fixed { background-color: LightGreen } +td.check { background-color: Khaki } +td.warning { background-color: Yellow } +td.error { background-color: Red } +tr:nth-child(even) { background-color: LightCyan } diff --git a/Benchmarks/conbench/report.py b/Benchmarks/conbench/report.py index e8a02880b3..b2e9cb7115 100644 --- a/Benchmarks/conbench/report.py +++ b/Benchmarks/conbench/report.py @@ -213,17 +213,7 @@ def summary_body_rows(): Benchmarks for {branch} / {short_sha} on {host} - +

Benchmarks for {branch} / {short_sha} on {host}

diff --git a/CMakeLists.txt b/CMakeLists.txt index 30e5cb5211..9810fbeb09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,12 +89,13 @@ add_definitions("-DMAPD_EDITION_${MAPD_EDITION}") string(TOLOWER "${MAPD_EDITION}" MAPD_EDITION_LOWER) # HeavyDB version number -set(MAPD_VERSION_MAJOR "7") -set(MAPD_VERSION_MINOR "1") +set(MAPD_VERSION_MAJOR "8") +set(MAPD_VERSION_MINOR "0") set(MAPD_VERSION_PATCH "0") set(MAPD_VERSION_EXTRA "dev") set(MAPD_VERSION_RAW "${MAPD_VERSION_MAJOR}.${MAPD_VERSION_MINOR}.${MAPD_VERSION_PATCH}${MAPD_VERSION_EXTRA}") -set(MAPD_IMMERSE_URL "http://builds.mapd.com/frontend/mapd2-dashboard-v2-137-release-prod.zip") +set(MAPD_IMMERSE_BUILD_ID "immerse-v2-latest-master-prod" CACHE STRING "Immerse Build ID") +set(MAPD_IMMERSE_URL "http://builds.mapd.com/frontend/${MAPD_IMMERSE_BUILD_ID}.zip") string(TIMESTAMP MAPD_BUILD_DATE "%Y%m%d") if($ENV{BUILD_NUMBER}) @@ -134,6 +135,31 @@ macro(set_alternate_linker linker) endif() endmacro() +macro(InstallVersionFile) + # `touch mapd_deps_version.txt` from build dir to silence warning. + find_file(MapdDepsVersion_FILE mapd_deps_version.txt PATH ${CMAKE_BINARY_DIR} NO_CACHE) + + if(NOT MapdDepsVersion_FILE) + message(WARNING "Build could NOT find deps version file mapd_deps_version.txt") + else() + message(STATUS "Found deps version file ${MapdDepsVersion_FILE}") + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/version) + set(HEAVY_DEPS_CUSTOM ${CMAKE_BINARY_DIR}/version/heavyai_deps_version.txt) + # cp mapd_version data to heavyai deps, removing un-tagged lines. 
+ add_custom_command(OUTPUT ${HEAVY_DEPS_CUSTOM} + DEPENDS ${MapdDepsVersion_FILE} + COMMAND ${CMAKE_COMMAND} -E copy ${MapdDepsVersion_FILE} ${HEAVY_DEPS_CUSTOM} + # In the copied deps files leave the first line with the deps generated info + # and any other line starting with 'Public Release:', though remove the public release tag + VERBATIM + COMMAND "sed" "-i" "-n" "/^Public Release:/s/^Public Release://p" ${HEAVY_DEPS_CUSTOM}) + + add_custom_target(HeavyDepsVersionTarget DEPENDS ${HEAVY_DEPS_CUSTOM}) + add_dependencies(heavydb HeavyDepsVersionTarget) + install(FILES ${HEAVY_DEPS_CUSTOM} DESTINATION "." COMPONENT "doc") + endif() +endmacro() + set(USE_ALTERNATE_LINKER "" CACHE STRING "Use alternate linker. Leave empty for system default; alternatives are 'gold', 'lld', 'bfd', 'mold'") if(NOT "${USE_ALTERNATE_LINKER}" STREQUAL "") set_alternate_linker(${USE_ALTERNATE_LINKER}) @@ -192,9 +218,8 @@ endif() option(ENABLE_CUDA "Enable CUDA support" ON) if(ENABLE_CUDA) enable_language(CUDA) - find_package(CUDA REQUIRED) - include_directories(${CUDA_INCLUDE_DIRS}) - list(APPEND CUDA_LIBRARIES ${CUDA_CUDA_LIBRARY}) + include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + list(APPEND CUDA_LIBRARIES cuda) add_definitions("-DHAVE_CUDA") else() @@ -918,10 +943,6 @@ target_link_libraries(mapd_thrift ${Thrift_LIBRARIES}) if("${MAPD_EDITION_LOWER}" STREQUAL "ee") option(ENABLE_OMNIVERSE_CONNECTOR "Enable Omniverse Connector" ON) - if(ENABLE_OMNIVERSE_CONNECTOR) - include_directories(ThirdParty/poly2tri) - add_subdirectory(ThirdParty/poly2tri) - endif() include_directories(Catalog/ee) include_directories(Distributed/ee) else() @@ -992,7 +1013,7 @@ if(ENABLE_RUNTIME_LIBS) find_package(Torch REQUIRED) if (DEFINED TORCH_CUDA_LIBRARIES) # Torch removes the CUDA Driver target from CUDA_LIBRARIES when doing its own CUDA CMake setup, so we have to re-add it - list(APPEND CUDA_LIBRARIES ${CUDA_CUDA_LIBRARY}) + list(APPEND CUDA_LIBRARIES cuda) 
add_compile_definitions("HAVE_CUDA_TORCH") endif() add_compile_definitions("HAVE_TORCH_TFS") @@ -1107,6 +1128,7 @@ endif() add_executable(heavydb HeavyDB.cpp ${CMAKE_BINARY_DIR}/MapDRelease.h) set_target_properties(heavydb PROPERTIES COMPILE_DEFINITIONS "${TIME_LIMITED_DEFINITIONS}") +InstallVersionFile() add_custom_command( DEPENDS ${CMAKE_SOURCE_DIR}/heavy.thrift diff --git a/Catalog/Catalog.cpp b/Catalog/Catalog.cpp index 473dc4f082..7d44626288 100644 --- a/Catalog/Catalog.cpp +++ b/Catalog/Catalog.cpp @@ -97,7 +97,10 @@ bool g_enable_fsi{true}; bool g_enable_s3_fsi{false}; int32_t g_distributed_leaf_idx{-1}; int32_t g_distributed_num_leaves{0}; -bool g_enable_logs_system_tables{false}; +bool g_enable_logs_system_tables{true}; +bool g_enable_logs_system_tables_auto_refresh{false}; +// 10 minutes refresh interval by default +std::string g_logs_system_tables_refresh_interval{"600S"}; extern bool g_cache_string_hash; extern bool g_enable_system_tables; @@ -2214,12 +2217,19 @@ list Catalog::getAllDashboardsMetadata() const { return dashboards; } -std::vector Catalog::getAllDashboardsMetadataCopy() const { +std::vector Catalog::getAllDashboardsMetadataForSysTable() const { cat_read_lock read_lock(this); std::vector dashboards; dashboards.reserve(dashboardDescriptorMap_.size()); for (auto dashboard_entry : dashboardDescriptorMap_) { - dashboards.emplace_back(*dashboard_entry.second); + const auto& cat_dashboard = dashboard_entry.second; + dashboards.emplace_back(); + auto& dashboard = dashboards.back(); + dashboard.dashboardId = cat_dashboard->dashboardId; + dashboard.dashboardName = cat_dashboard->dashboardName; + dashboard.userId = cat_dashboard->userId; + dashboard.updateTime = cat_dashboard->updateTime; + dashboard.dashboardMetadata = cat_dashboard->dashboardMetadata; } return dashboards; } @@ -6435,8 +6445,19 @@ inline SQLTypeInfo get_var_encoded_text_array_type() { void set_common_log_system_table_options(foreign_storage::ForeignTable& foreign_table) { using 
foreign_storage::ForeignTable; - foreign_table.options[ForeignTable::REFRESH_TIMING_TYPE_KEY] = - ForeignTable::MANUAL_REFRESH_TIMING_TYPE; + if (g_enable_logs_system_tables_auto_refresh) { + foreign_table.options[ForeignTable::REFRESH_TIMING_TYPE_KEY] = + ForeignTable::SCHEDULE_REFRESH_TIMING_TYPE; + // Set start date time to 1 minute from now. + auto start_epoch = foreign_storage::RefreshTimeCalculator::getCurrentTime() + 60; + foreign_table.options[ForeignTable::REFRESH_START_DATE_TIME_KEY] = + shared::convert_temporal_to_iso_format({kTIMESTAMP}, start_epoch); + foreign_table.options[ForeignTable::REFRESH_INTERVAL_KEY] = + g_logs_system_tables_refresh_interval; + } else { + foreign_table.options[ForeignTable::REFRESH_TIMING_TYPE_KEY] = + ForeignTable::MANUAL_REFRESH_TIMING_TYPE; + } foreign_table.options[ForeignTable::REFRESH_UPDATE_TYPE_KEY] = ForeignTable::APPEND_REFRESH_UPDATE_TYPE; using foreign_storage::AbstractFileStorageDataWrapper; @@ -6467,6 +6488,8 @@ void clear_cached_table_data(const Data_Namespace::DataMgr* data_mgr, void drop_tables(Catalog& catalog, const std::vector& table_names) { for (const auto& table_name : table_names) { if (auto td = catalog.getMetadataForTable(table_name)) { + clear_cached_table_data( + &catalog.getDataMgr(), catalog.getDatabaseId(), td->tableId); catalog.dropTable(td); } } diff --git a/Catalog/Catalog.h b/Catalog/Catalog.h index e7b6ed0b9c..6accb6280b 100644 --- a/Catalog/Catalog.h +++ b/Catalog/Catalog.h @@ -261,7 +261,7 @@ class Catalog final { std::list getAllTableMetadata() const; std::vector getAllTableMetadataCopy() const; std::list getAllDashboardsMetadata() const; - std::vector getAllDashboardsMetadataCopy() const; + std::vector getAllDashboardsMetadataForSysTable() const; const DBMetadata& getCurrentDB() const { return currentDB_; } Data_Namespace::DataMgr& getDataMgr() const { return *dataMgr_; } std::shared_ptr getCalciteMgr() const { return calciteMgr_; } diff --git a/Catalog/DdlCommandExecutor.cpp 
b/Catalog/DdlCommandExecutor.cpp index 7823f5537e..389b1121b1 100644 --- a/Catalog/DdlCommandExecutor.cpp +++ b/Catalog/DdlCommandExecutor.cpp @@ -2241,7 +2241,7 @@ ShowModelFeatureDetailsCommand::extractExtraMetadata( #ifdef HAVE_ONEDAL case MLModelType::RANDOM_FOREST_REG: { const auto random_forest_reg_model = - std::dynamic_pointer_cast(model); + std::dynamic_pointer_cast(model); extra_metadata = random_forest_reg_model->getVariableImportanceScores(); if (!extra_metadata.empty()) { label_infos.emplace_back("feature_importance", SQLTypeInfo(kDOUBLE, true)); diff --git a/DataMgr/Allocators/CpuMgrArenaAllocator.cpp b/DataMgr/Allocators/CpuMgrArenaAllocator.cpp new file mode 100644 index 0000000000..0acacade6f --- /dev/null +++ b/DataMgr/Allocators/CpuMgrArenaAllocator.cpp @@ -0,0 +1,74 @@ +/* + * Copyright 2023 HEAVY.AI, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "CpuMgrArenaAllocator.h" + +#include "Catalog/SysCatalog.h" + +namespace { +Data_Namespace::DataMgr& get_data_mgr_instance() { + const auto& sys_catalog = Catalog_Namespace::SysCatalog::instance(); + CHECK(sys_catalog.isInitialized()); + return sys_catalog.getDataMgr(); +} +} // namespace + +CpuMgrArenaAllocator::CpuMgrArenaAllocator() + : data_mgr_(get_data_mgr_instance()), size_(0) {} + +CpuMgrArenaAllocator::~CpuMgrArenaAllocator() { + for (auto buffer : allocated_buffers_) { + data_mgr_.free(buffer); + } +} + +void* CpuMgrArenaAllocator::allocate(size_t num_bytes) { + if (num_bytes == 0) { + return nullptr; + } + AbstractBuffer* buffer = nullptr; + try { + buffer = data_mgr_.alloc(Data_Namespace::CPU_LEVEL, 0, num_bytes); + } catch (const OutOfMemory& e) { + LOG(ERROR) << e.what(); + throw OutOfHostMemory(num_bytes); + } + CHECK(buffer); + allocated_buffers_.emplace_back(buffer); + + auto mem_ptr = buffer->getMemoryPtr(); + CHECK(mem_ptr); + size_ += num_bytes; + return mem_ptr; +} + +void* CpuMgrArenaAllocator::allocateAndZero(const size_t num_bytes) { + auto ret = allocate(num_bytes); + std::memset(ret, 0, num_bytes); + return ret; +} + +size_t CpuMgrArenaAllocator::bytesUsed() const { + return size_; +} + +size_t CpuMgrArenaAllocator::totalBytes() const { + return size_; +} + +Arena::MemoryType CpuMgrArenaAllocator::getMemoryType() const { + return Arena::MemoryType::DRAM; +} diff --git a/DataMgr/Allocators/CpuMgrArenaAllocator.h b/DataMgr/Allocators/CpuMgrArenaAllocator.h new file mode 100644 index 0000000000..937d9de037 --- /dev/null +++ b/DataMgr/Allocators/CpuMgrArenaAllocator.h @@ -0,0 +1,51 @@ +/* + * Copyright 2023 HEAVY.AI, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file CpuMgrArenaAllocator.h + * @brief Allocate CPU memory using CpuBuffers via DataMgr. + */ + +#pragma once + +#include "DataMgr/Allocators/ArenaAllocator.h" + +namespace Data_Namespace { +class AbstractBuffer; +class DataMgr; +} // namespace Data_Namespace + +class CpuMgrArenaAllocator : public Arena { + public: + CpuMgrArenaAllocator(); + + ~CpuMgrArenaAllocator() override; + + void* allocate(size_t num_bytes) override; + + void* allocateAndZero(const size_t num_bytes) override; + + size_t bytesUsed() const override; + + size_t totalBytes() const override; + + MemoryType getMemoryType() const override; + + private: + Data_Namespace::DataMgr& data_mgr_; + std::vector allocated_buffers_; + size_t size_; +}; diff --git a/DataMgr/Allocators/FastAllocator.h b/DataMgr/Allocators/FastAllocator.h new file mode 100644 index 0000000000..d17fb4b178 --- /dev/null +++ b/DataMgr/Allocators/FastAllocator.h @@ -0,0 +1,96 @@ +/* + * Copyright 2023 HEAVY.AI, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file FastAllocator.h + * @brief Quickly allocate many memory pieces by reserving them ahead of time. + * Calls to allocate() are thread-safe. + */ + +#pragma once + +#include "Shared/SimpleAllocator.h" + +#include +#include +#include +#include + +namespace heavyai { +namespace allocator { +namespace detail { + +inline std::runtime_error outOfMemoryError(size_t n, size_t remaining, size_t capacity) { + std::ostringstream oss; + oss << "allocate(" << n << ") called but only " << remaining << " out of " << capacity + << " available."; + return std::runtime_error(oss.str()); +} + +// FastAllocator accepts a pre-allocated buffer of given capacity and +// allocates sequential chunks, tracking the size_ starting at size_=0. +// If size_ exceeds capacity_ then throw an exception. +// There is no deallocate() function, nor is there a destructor. +template +class FastAllocator : public SimpleAllocator { + public: + FastAllocator() : buffer_(nullptr), capacity_(0), size_(0) {} + FastAllocator(T* buffer, size_t capacity) + : buffer_(buffer), capacity_(capacity), size_(0) {} + FastAllocator(FastAllocator const&) = delete; + FastAllocator(FastAllocator&&) = delete; + FastAllocator& operator=(FastAllocator const&) = delete; + FastAllocator& operator=(FastAllocator&& rhs) { + buffer_ = rhs.buffer_; + capacity_ = rhs.capacity_; + size_ = rhs.size_; + rhs.reset(); + return *this; + } + + // Allocate n>0 elements of type T. Caller responsible for proper data alignment. 
+ T* allocate(size_t const n) { + CHECK(n); + std::lock_guard lock_guard(mutex_); + if (n <= available()) { + T* const ptr = buffer_ + size_; + size_ += n; + return ptr; + } + throw outOfMemoryError(n, available(), capacity_); + } + + size_t available() const { return capacity_ - size_; } // number of available elements + size_t capacity() const { return capacity_; } // number of reserved elements + + protected: + void reset() { + buffer_ = nullptr; + capacity_ = 0u; + size_ = 0u; + } + + T* buffer_; // Pointer to reserved buffer. + size_t capacity_; // Number of elements of type T reserved. + size_t size_; // Number of elements of type T allocated. + mutable std::mutex mutex_; +}; + +} // namespace detail +} // namespace allocator +} // namespace heavyai + +using heavyai::allocator::detail::FastAllocator; diff --git a/DataMgr/BufferMgr/BufferMgr.cpp b/DataMgr/BufferMgr/BufferMgr.cpp index 3fd6e64c22..75c6160a9b 100644 --- a/DataMgr/BufferMgr/BufferMgr.cpp +++ b/DataMgr/BufferMgr/BufferMgr.cpp @@ -48,33 +48,40 @@ BufferMgr::BufferMgr(const int device_id, const size_t max_buffer_pool_size, const size_t min_slab_size, const size_t max_slab_size, + const size_t default_slab_size, const size_t page_size, AbstractBufferMgr* parent_mgr) : AbstractBufferMgr(device_id) , max_buffer_pool_size_(max_buffer_pool_size) , min_slab_size_(min_slab_size) , max_slab_size_(max_slab_size) + , default_slab_size_(default_slab_size) , page_size_(page_size) , num_pages_allocated_(0) , allocations_capped_(false) , parent_mgr_(parent_mgr) , max_buffer_id_(0) , buffer_epoch_(0) { - CHECK(max_buffer_pool_size_ > 0); - CHECK(page_size_ > 0); + CHECK_GT(max_buffer_pool_size_, size_t(0)); + CHECK_GT(page_size_, size_t(0)); // TODO change checks on run-time configurable slab size variables to exceptions - CHECK(min_slab_size_ > 0); - CHECK(max_slab_size_ > 0); - CHECK(min_slab_size_ <= max_slab_size_); - CHECK(min_slab_size_ % page_size_ == 0); - CHECK(max_slab_size_ % page_size_ == 0); + 
CHECK_GT(min_slab_size_, size_t(0)); + CHECK_GT(max_slab_size_, size_t(0)); + CHECK_GT(default_slab_size_, size_t(0)); + CHECK_LE(min_slab_size_, max_slab_size_); + CHECK_GE(default_slab_size_, min_slab_size_); + CHECK_LE(default_slab_size_, max_slab_size_); + CHECK_EQ(min_slab_size_ % page_size_, size_t(0)); + CHECK_EQ(max_slab_size_ % page_size_, size_t(0)); + CHECK_EQ(default_slab_size_ % page_size_, size_t(0)); max_buffer_pool_num_pages_ = max_buffer_pool_size_ / page_size_; max_num_pages_per_slab_ = max_slab_size_ / page_size_; min_num_pages_per_slab_ = min_slab_size_ / page_size_; - current_max_slab_page_size_ = - max_num_pages_per_slab_; // current_max_slab_page_size_ will drop as allocations - // fail - this is the high water mark + default_num_pages_per_slab_ = default_slab_size_ / page_size_; + current_max_num_pages_per_slab_ = + max_num_pages_per_slab_; // current_max_num_pages_per_slab_ will drop as + // allocations fail - this is the high water mark } /// Frees the heap-allocated buffer pool memory @@ -84,9 +91,9 @@ BufferMgr::~BufferMgr() { void BufferMgr::reinit() { num_pages_allocated_ = 0; - current_max_slab_page_size_ = - max_num_pages_per_slab_; // current_max_slab_page_size_ will drop as allocations - // fail - this is the high water mark + current_max_num_pages_per_slab_ = + max_num_pages_per_slab_; // current_max_num_pages_per_slab_ will drop as + // allocations fail - this is the high water mark allocations_capped_ = false; } @@ -296,48 +303,58 @@ BufferList::iterator BufferMgr::findFreeBuffer(size_t num_bytes) { // If we're here then we didn't find a free segment of sufficient size // First we see if we can add another slab while (!allocations_capped_ && num_pages_allocated_ < max_buffer_pool_num_pages_) { + size_t allocated_num_pages{0}; try { - size_t pagesLeft = max_buffer_pool_num_pages_ - num_pages_allocated_; - if (pagesLeft < current_max_slab_page_size_) { - current_max_slab_page_size_ = pagesLeft; + auto pages_left = 
max_buffer_pool_num_pages_ - num_pages_allocated_; + if (pages_left < current_max_num_pages_per_slab_) { + current_max_num_pages_per_slab_ = pages_left; } if (num_pages_requested <= - current_max_slab_page_size_) { // don't try to allocate if the - // new slab won't be big enough - auto alloc_ms = measure<>::execution( - [&]() { addSlab(current_max_slab_page_size_ * page_size_); }); - LOG(INFO) << "ALLOCATION slab of " << current_max_slab_page_size_ << " pages (" - << current_max_slab_page_size_ * page_size_ << "B) created in " - << alloc_ms << " ms " << getStringMgrType() << ":" << device_id_; + current_max_num_pages_per_slab_) { // don't try to allocate if the + // new slab won't be big enough + if (default_num_pages_per_slab_ < current_max_num_pages_per_slab_) { + allocated_num_pages = + std::max(default_num_pages_per_slab_, num_pages_requested); + } else { + allocated_num_pages = current_max_num_pages_per_slab_; + } + const auto slab_in_bytes = allocated_num_pages * page_size_; + VLOG(1) << "Try to allocate SLAB of " << allocated_num_pages << " pages (" + << slab_in_bytes << " bytes) on " << getStringMgrType() << ":" + << device_id_; + auto alloc_ms = measure<>::execution([&]() { addSlab(slab_in_bytes); }); + LOG(INFO) << "ALLOCATION slab of " << allocated_num_pages << " pages (" + << slab_in_bytes << "B) created in " << alloc_ms << " ms " + << getStringMgrType() << ":" << device_id_; } else { break; } // if here then addSlab succeeded - num_pages_allocated_ += current_max_slab_page_size_; + CHECK_GT(allocated_num_pages, size_t(0)); + num_pages_allocated_ += allocated_num_pages; return findFreeBufferInSlab( num_slabs, num_pages_requested); // has to succeed since we made sure to request a slab // big enough to accomodate request } catch (std::runtime_error& error) { // failed to allocate slab - LOG(INFO) << "ALLOCATION Attempted slab of " << current_max_slab_page_size_ - << " pages (" << current_max_slab_page_size_ * page_size_ << "B) failed " + LOG(INFO) << 
"ALLOCATION Attempted slab of " << allocated_num_pages << " pages (" + << (allocated_num_pages * page_size_) << "B) failed " << getStringMgrType() << ":" << device_id_; // check if there is any point halving currentMaxSlabSize and trying again // if the request wont fit in half available then let try once at full size // if we have already tries at full size and failed then break as // there could still be room enough for other later request but // not for his current one - if (num_pages_requested > current_max_slab_page_size_ / 2 && - current_max_slab_page_size_ != num_pages_requested) { - current_max_slab_page_size_ = num_pages_requested; + if (num_pages_requested > current_max_num_pages_per_slab_ / 2 && + current_max_num_pages_per_slab_ != num_pages_requested) { + current_max_num_pages_per_slab_ = num_pages_requested; } else { - current_max_slab_page_size_ /= 2; - if (current_max_slab_page_size_ < - (min_num_pages_per_slab_)) { // should be a constant + current_max_num_pages_per_slab_ /= 2; + if (current_max_num_pages_per_slab_ < min_num_pages_per_slab_) { allocations_capped_ = true; // dump out the slabs and their sizes - LOG(INFO) << "ALLOCATION Capped " << current_max_slab_page_size_ - << " Minimum size = " << (min_num_pages_per_slab_) << " " + LOG(INFO) << "ALLOCATION Capped " << current_max_num_pages_per_slab_ + << " Minimum size = " << min_num_pages_per_slab_ << " " << getStringMgrType() << ":" << device_id_; } } diff --git a/DataMgr/BufferMgr/BufferMgr.h b/DataMgr/BufferMgr/BufferMgr.h index 1e6b9bab1e..ffa508f6d7 100644 --- a/DataMgr/BufferMgr/BufferMgr.h +++ b/DataMgr/BufferMgr/BufferMgr.h @@ -101,6 +101,7 @@ class BufferMgr : public AbstractBufferMgr { // implements const size_t max_buffer_size, const size_t min_slab_size, const size_t max_slab_size, + const size_t default_slab_size, const size_t page_size, AbstractBufferMgr* parent_mgr = 0); @@ -172,8 +173,11 @@ class BufferMgr : public AbstractBufferMgr { // implements max_buffer_pool_size_; /// max 
number of bytes allocated for the buffer pool const size_t min_slab_size_; /// minimum size of the individual memory allocations that /// compose the buffer pool (up to maxBufferSize_) - const size_t max_slab_size_; /// size of the individual memory allocations that compose - /// the buffer pool (up to maxBufferSize_) + const size_t max_slab_size_; /// max size of the individual memory allocations that + /// compose the buffer pool (up to maxBufferSize_) + const size_t + default_slab_size_; /// default size of the individual memory allocations that + /// compose the buffer pool (up to maxBufferSize_) const size_t page_size_; std::vector slabs_; /// vector of beginning memory addresses for each /// allocation of the buffer pool @@ -206,7 +210,8 @@ class BufferMgr : public AbstractBufferMgr { // implements size_t num_pages_allocated_; size_t min_num_pages_per_slab_; size_t max_num_pages_per_slab_; - size_t current_max_slab_page_size_; + size_t default_num_pages_per_slab_; + size_t current_max_num_pages_per_slab_; bool allocations_capped_; AbstractBufferMgr* parent_mgr_; int max_buffer_id_; diff --git a/DataMgr/BufferMgr/CpuBufferMgr/CpuBufferMgr.cpp b/DataMgr/BufferMgr/CpuBufferMgr/CpuBufferMgr.cpp index 225e20794f..11ee26e516 100644 --- a/DataMgr/BufferMgr/CpuBufferMgr/CpuBufferMgr.cpp +++ b/DataMgr/BufferMgr/CpuBufferMgr/CpuBufferMgr.cpp @@ -56,7 +56,7 @@ void CpuBufferMgr::allocateBuffer(BufferList::iterator seg_it, } void CpuBufferMgr::initializeMem() { - allocator_.reset(new DramArena(max_slab_size_ + kArenaBlockOverhead)); + allocator_.reset(new DramArena(default_slab_size_ + kArenaBlockOverhead)); } } // namespace Buffer_Namespace diff --git a/DataMgr/BufferMgr/CpuBufferMgr/CpuBufferMgr.h b/DataMgr/BufferMgr/CpuBufferMgr/CpuBufferMgr.h index 75f5f06048..4ec455f33c 100644 --- a/DataMgr/BufferMgr/CpuBufferMgr/CpuBufferMgr.h +++ b/DataMgr/BufferMgr/CpuBufferMgr/CpuBufferMgr.h @@ -33,12 +33,14 @@ class CpuBufferMgr : public BufferMgr { CudaMgr_Namespace::CudaMgr* 
cuda_mgr, const size_t min_slab_size, const size_t max_slab_size, + const size_t default_slab_size, const size_t page_size, AbstractBufferMgr* parent_mgr = nullptr) : BufferMgr(device_id, max_buffer_pool_size, min_slab_size, max_slab_size, + default_slab_size, page_size, parent_mgr) , cuda_mgr_(cuda_mgr) { diff --git a/DataMgr/BufferMgr/CpuBufferMgr/TieredCpuBufferMgr.cpp b/DataMgr/BufferMgr/CpuBufferMgr/TieredCpuBufferMgr.cpp index d9a51d71c4..4df01223e0 100644 --- a/DataMgr/BufferMgr/CpuBufferMgr/TieredCpuBufferMgr.cpp +++ b/DataMgr/BufferMgr/CpuBufferMgr/TieredCpuBufferMgr.cpp @@ -43,6 +43,7 @@ TieredCpuBufferMgr::TieredCpuBufferMgr(const int device_id, CudaMgr_Namespace::CudaMgr* cuda_mgr, const size_t min_slab_size, const size_t max_slab_size, + const size_t default_slab_size, const size_t page_size, const CpuTierSizeVector& cpu_tier_sizes, AbstractBufferMgr* parent_mgr) @@ -51,14 +52,15 @@ TieredCpuBufferMgr::TieredCpuBufferMgr(const int device_id, cuda_mgr, min_slab_size, max_slab_size, + default_slab_size, page_size, parent_mgr) { CHECK(cpu_tier_sizes.size() == numCpuTiers); allocators_.emplace_back( - std::make_unique(max_slab_size_ + kArenaBlockOverhead), + std::make_unique(default_slab_size_ + kArenaBlockOverhead), cpu_tier_sizes[CpuTier::DRAM]); allocators_.emplace_back( - std::make_unique(max_slab_size_ + kArenaBlockOverhead), + std::make_unique(default_slab_size_ + kArenaBlockOverhead), cpu_tier_sizes[CpuTier::PMEM]); } @@ -111,9 +113,9 @@ void TieredCpuBufferMgr::freeAllMem() { void TieredCpuBufferMgr::initializeMem() { allocators_[CpuTier::DRAM].first = - std::make_unique(max_slab_size_ + kArenaBlockOverhead); + std::make_unique(default_slab_size_ + kArenaBlockOverhead); allocators_[CpuTier::PMEM].first = - std::make_unique(max_slab_size_ + kArenaBlockOverhead); + std::make_unique(default_slab_size_ + kArenaBlockOverhead); slab_to_allocator_map_.clear(); } diff --git a/DataMgr/BufferMgr/CpuBufferMgr/TieredCpuBufferMgr.h 
b/DataMgr/BufferMgr/CpuBufferMgr/TieredCpuBufferMgr.h index f8ec99a250..df73353b1d 100644 --- a/DataMgr/BufferMgr/CpuBufferMgr/TieredCpuBufferMgr.h +++ b/DataMgr/BufferMgr/CpuBufferMgr/TieredCpuBufferMgr.h @@ -37,6 +37,7 @@ class TieredCpuBufferMgr : public CpuBufferMgr { CudaMgr_Namespace::CudaMgr* cuda_mgr, const size_t min_slab_size, const size_t max_slab_size, + const size_t default_slab_size, const size_t page_size, const CpuTierSizeVector& cpu_tier_sizes, AbstractBufferMgr* parent_mgr = nullptr); diff --git a/DataMgr/BufferMgr/GpuCudaBufferMgr/GpuCudaBufferMgr.cpp b/DataMgr/BufferMgr/GpuCudaBufferMgr/GpuCudaBufferMgr.cpp index e4f474493a..c0c137272d 100644 --- a/DataMgr/BufferMgr/GpuCudaBufferMgr/GpuCudaBufferMgr.cpp +++ b/DataMgr/BufferMgr/GpuCudaBufferMgr/GpuCudaBufferMgr.cpp @@ -27,12 +27,14 @@ GpuCudaBufferMgr::GpuCudaBufferMgr(const int device_id, CudaMgr_Namespace::CudaMgr* cuda_mgr, const size_t min_slab_size, const size_t max_slab_size, + const size_t default_slab_size, const size_t page_size, AbstractBufferMgr* parent_mgr) : BufferMgr(device_id, max_buffer_pool_size, min_slab_size, max_slab_size, + default_slab_size, page_size, parent_mgr) , cuda_mgr_(cuda_mgr) {} diff --git a/DataMgr/BufferMgr/GpuCudaBufferMgr/GpuCudaBufferMgr.h b/DataMgr/BufferMgr/GpuCudaBufferMgr/GpuCudaBufferMgr.h index fca9871061..f632e02073 100644 --- a/DataMgr/BufferMgr/GpuCudaBufferMgr/GpuCudaBufferMgr.h +++ b/DataMgr/BufferMgr/GpuCudaBufferMgr/GpuCudaBufferMgr.h @@ -31,6 +31,7 @@ class GpuCudaBufferMgr : public BufferMgr { CudaMgr_Namespace::CudaMgr* cuda_mgr, const size_t min_slab_size, const size_t max_slab_size, + const size_t default_slab_size, const size_t page_size, AbstractBufferMgr* parent_mgr = 0); inline MgrType getMgrType() override { return GPU_MGR; } diff --git a/DataMgr/CMakeLists.txt b/DataMgr/CMakeLists.txt index 027524ab0f..dd31d12c72 100644 --- a/DataMgr/CMakeLists.txt +++ b/DataMgr/CMakeLists.txt @@ -4,6 +4,7 @@ endif() set(datamgr_source_files 
AbstractBuffer.cpp + Allocators/CpuMgrArenaAllocator.cpp Allocators/CudaAllocator.cpp Allocators/ThrustAllocator.cpp Chunk/Chunk.cpp diff --git a/DataMgr/DataMgr.cpp b/DataMgr/DataMgr.cpp index 741dc81069..f3830ff87a 100644 --- a/DataMgr/DataMgr.cpp +++ b/DataMgr/DataMgr.cpp @@ -34,10 +34,16 @@ #include #endif +#include #include #include +#include +#include +#include #include +#include +#include extern bool g_enable_fsi; @@ -47,6 +53,8 @@ std::string g_pmem_path{}; size_t g_pmem_size{0}; #endif +bool g_use_cpu_mem_pool_size_for_max_cpu_slab_size{false}; + namespace Data_Namespace { namespace { @@ -158,8 +166,11 @@ DataMgr::SystemMemoryUsage DataMgr::getSystemMemoryUsage() const { usage.regular = (resident - shared) * page_size; usage.shared = shared * page_size; - ProcBuddyinfoParser bi; + ProcBuddyinfoParser bi{}; + bi.parseBuddyinfo(); usage.frag = bi.getFragmentationPercent(); + usage.avail_pages = bi.getSumAvailPages(); + usage.high_blocks = bi.getSumHighestBlocks(); #else @@ -169,7 +180,9 @@ DataMgr::SystemMemoryUsage DataMgr::getSystemMemoryUsage() const { usage.vtotal = 0; usage.regular = 0; usage.shared = 0; - usage.frag = 0; + usage.frag = 0.0; + usage.avail_pages = 0; + usage.high_blocks = 0; #endif @@ -201,20 +214,23 @@ size_t DataMgr::getTotalSystemMemory() { void DataMgr::allocateCpuBufferMgr(int32_t device_id, size_t total_cpu_size, - size_t minCpuSlabSize, - size_t maxCpuSlabSize, + size_t min_cpu_slab_size, + size_t max_cpu_slab_size, + size_t default_cpu_slab_size, size_t page_size, const CpuTierSizeVector& cpu_tier_sizes) { #ifdef ENABLE_MEMKIND if (g_enable_tiered_cpu_mem) { - bufferMgrs_[1].push_back(new Buffer_Namespace::TieredCpuBufferMgr(0, - total_cpu_size, - cudaMgr_.get(), - minCpuSlabSize, - maxCpuSlabSize, - page_size, - cpu_tier_sizes, - bufferMgrs_[0][0])); + bufferMgrs_[1].push_back( + new Buffer_Namespace::TieredCpuBufferMgr(0, + total_cpu_size, + cudaMgr_.get(), + min_cpu_slab_size, + max_cpu_slab_size, + default_cpu_slab_size, + 
page_size, + cpu_tier_sizes, + bufferMgrs_[0][0])); return; } #endif @@ -222,8 +238,9 @@ void DataMgr::allocateCpuBufferMgr(int32_t device_id, bufferMgrs_[1].push_back(new Buffer_Namespace::CpuBufferMgr(0, total_cpu_size, cudaMgr_.get(), - minCpuSlabSize, - maxCpuSlabSize, + min_cpu_slab_size, + max_cpu_slab_size, + default_cpu_slab_size, page_size, bufferMgrs_[0][0])); } @@ -243,6 +260,16 @@ void DataMgr::resetBufferMgrs(const File_Namespace::DiskCacheConfig& cache_confi createTopLevelMetadata(); } +namespace { +size_t get_slab_size(size_t initial_slab_size, + size_t buffer_pool_size, + size_t page_size) { + auto slab_size = std::min(initial_slab_size, buffer_pool_size); + slab_size = (slab_size / page_size) * page_size; + return slab_size; +} +} // namespace + void DataMgr::populateMgrs(const SystemParameters& system_parameters, const size_t userSpecifiedNumReaderThreads, const File_Namespace::DiskCacheConfig& cache_config) { @@ -254,22 +281,31 @@ void DataMgr::populateMgrs(const SystemParameters& system_parameters, levelSizes_.push_back(1); auto page_size = system_parameters.buffer_page_size; CHECK_GT(page_size, size_t(0)); - size_t cpuBufferSize = system_parameters.cpu_buffer_mem_bytes; - if (cpuBufferSize == 0) { // if size is not specified + auto cpu_buffer_size = system_parameters.cpu_buffer_mem_bytes; + if (cpu_buffer_size == 0) { // if size is not specified const auto total_system_memory = getTotalSystemMemory(); VLOG(1) << "Detected " << (float)total_system_memory / (1024 * 1024) << "M of total system memory."; - cpuBufferSize = total_system_memory * - 0.8; // should get free memory instead of this ugly heuristic + cpu_buffer_size = total_system_memory * + 0.8; // should get free memory instead of this ugly heuristic } - size_t minCpuSlabSize = std::min(system_parameters.min_cpu_slab_size, cpuBufferSize); - minCpuSlabSize = (minCpuSlabSize / page_size) * page_size; - size_t maxCpuSlabSize = std::min(system_parameters.max_cpu_slab_size, cpuBufferSize); - 
maxCpuSlabSize = (maxCpuSlabSize / page_size) * page_size; - LOG(INFO) << "Min CPU Slab Size is " << (float)minCpuSlabSize / (1024 * 1024) << "MB"; - LOG(INFO) << "Max CPU Slab Size is " << (float)maxCpuSlabSize / (1024 * 1024) << "MB"; - LOG(INFO) << "Max memory pool size for CPU is " << (float)cpuBufferSize / (1024 * 1024) + auto min_cpu_slab_size = + get_slab_size(system_parameters.min_cpu_slab_size, cpu_buffer_size, page_size); + auto max_cpu_slab_size = + g_use_cpu_mem_pool_size_for_max_cpu_slab_size + ? cpu_buffer_size + : get_slab_size( + system_parameters.max_cpu_slab_size, cpu_buffer_size, page_size); + auto default_cpu_slab_size = + get_slab_size(system_parameters.default_cpu_slab_size, cpu_buffer_size, page_size); + LOG(INFO) << "Min CPU Slab Size is " << float(min_cpu_slab_size) / (1024 * 1024) << "MB"; + LOG(INFO) << "Max CPU Slab Size is " << float(max_cpu_slab_size) / (1024 * 1024) + << "MB"; + LOG(INFO) << "Default CPU Slab Size is " << float(default_cpu_slab_size) / (1024 * 1024) + << "MB"; + LOG(INFO) << "Max memory pool size for CPU is " + << float(cpu_buffer_size) / (1024 * 1024) << "MB"; size_t total_cpu_size = 0; @@ -286,47 +322,61 @@ void DataMgr::populateMgrs(const SystemParameters& system_parameters, } #else CpuTierSizeVector cpu_tier_sizes{}; - total_cpu_size = cpuBufferSize; + total_cpu_size = cpu_buffer_size; #endif if (hasGpus_ || cudaMgr_) { LOG(INFO) << "Reserved GPU memory is " << (float)reservedGpuMem_ / (1024 * 1024) << "MB includes render buffer allocation"; bufferMgrs_.resize(3); - allocateCpuBufferMgr( - 0, total_cpu_size, minCpuSlabSize, maxCpuSlabSize, page_size, cpu_tier_sizes); + allocateCpuBufferMgr(0, + total_cpu_size, + min_cpu_slab_size, + max_cpu_slab_size, + default_cpu_slab_size, + page_size, + cpu_tier_sizes); levelSizes_.push_back(1); - int numGpus = cudaMgr_->getDeviceCount(); - for (int gpuNum = 0; gpuNum < numGpus; ++gpuNum) { - size_t gpuMaxMemSize = + auto num_gpus = cudaMgr_->getDeviceCount(); + for (int 
gpu_num = 0; gpu_num < num_gpus; ++gpu_num) { + auto gpu_max_mem_size = system_parameters.gpu_buffer_mem_bytes != 0 ? system_parameters.gpu_buffer_mem_bytes - : (cudaMgr_->getDeviceProperties(gpuNum)->globalMem) - (reservedGpuMem_); - size_t minGpuSlabSize = - std::min(system_parameters.min_gpu_slab_size, gpuMaxMemSize); - minGpuSlabSize = (minGpuSlabSize / page_size) * page_size; - size_t maxGpuSlabSize = - std::min(system_parameters.max_gpu_slab_size, gpuMaxMemSize); - maxGpuSlabSize = (maxGpuSlabSize / page_size) * page_size; - LOG(INFO) << "Min GPU Slab size for GPU " << gpuNum << " is " - << (float)minGpuSlabSize / (1024 * 1024) << "MB"; - LOG(INFO) << "Max GPU Slab size for GPU " << gpuNum << " is " - << (float)maxGpuSlabSize / (1024 * 1024) << "MB"; - LOG(INFO) << "Max memory pool size for GPU " << gpuNum << " is " - << (float)gpuMaxMemSize / (1024 * 1024) << "MB"; - bufferMgrs_[2].push_back(new Buffer_Namespace::GpuCudaBufferMgr(gpuNum, - gpuMaxMemSize, - cudaMgr_.get(), - minGpuSlabSize, - maxGpuSlabSize, - page_size, - bufferMgrs_[1][0])); + : (cudaMgr_->getDeviceProperties(gpu_num)->globalMem) - (reservedGpuMem_); + auto min_gpu_slab_size = + get_slab_size(system_parameters.min_gpu_slab_size, gpu_max_mem_size, page_size); + auto max_gpu_slab_size = + get_slab_size(system_parameters.max_gpu_slab_size, gpu_max_mem_size, page_size); + auto default_gpu_slab_size = get_slab_size( + system_parameters.default_gpu_slab_size, gpu_max_mem_size, page_size); + LOG(INFO) << "Min GPU Slab size for GPU " << gpu_num << " is " + << float(min_gpu_slab_size) / (1024 * 1024) << "MB"; + LOG(INFO) << "Max GPU Slab size for GPU " << gpu_num << " is " + << float(max_gpu_slab_size) / (1024 * 1024) << "MB"; + LOG(INFO) << "Default GPU Slab size for GPU " << gpu_num << " is " + << float(default_gpu_slab_size) / (1024 * 1024) << "MB"; + LOG(INFO) << "Max memory pool size for GPU " << gpu_num << " is " + << float(gpu_max_mem_size) / (1024 * 1024) << "MB"; + bufferMgrs_[2].push_back( 
+ new Buffer_Namespace::GpuCudaBufferMgr(gpu_num, + gpu_max_mem_size, + cudaMgr_.get(), + min_gpu_slab_size, + max_gpu_slab_size, + default_gpu_slab_size, + page_size, + bufferMgrs_[1][0])); } - levelSizes_.push_back(numGpus); + levelSizes_.push_back(num_gpus); } else { - allocateCpuBufferMgr( - 0, total_cpu_size, minCpuSlabSize, maxCpuSlabSize, page_size, cpu_tier_sizes); + allocateCpuBufferMgr(0, + total_cpu_size, + min_cpu_slab_size, + max_cpu_slab_size, + default_cpu_slab_size, + page_size, + cpu_tier_sizes); levelSizes_.push_back(1); } } @@ -670,6 +720,8 @@ std::ostream& operator<<(std::ostream& os, const DataMgr::SystemMemoryUsage& mem os << " \"ProcessPlusSwapMB\": " << mem_info.regular / (1024. * 1024.) << ","; os << " \"ProcessSharedMB\": " << mem_info.shared / (1024. * 1024.) << ","; os << " \"FragmentationPercent\": " << mem_info.frag; + os << ", \"BuddyinfoHighBlocks\": " << mem_info.high_blocks; + os << ", \"BuddyinfoAvailPages\": " << mem_info.avail_pages; os << " }"; return os; } @@ -711,4 +763,117 @@ Buffer_Namespace::GpuCudaBufferMgr* DataMgr::getGpuBufferMgr(int32_t device_id) } } +namespace { +constexpr unsigned kMaxBuddyinfoBlocks = 32; +constexpr unsigned kMaxBuddyinfoTokens = kMaxBuddyinfoBlocks + 4; +constexpr double kErrorCodeUnableToOpenFile = -1.0; +constexpr double kErrorCodeOutOfMemory = -2.0; +template +using small_vector = boost::container::small_vector; + +struct BuddyinfoBlocks { + small_vector blocks; + + // Sum total pages in BuddyinfoBlocks when iterated in reverse using Horner's method. + struct Horner { + size_t operator()(size_t sum, size_t blocks) const { return 2 * sum + blocks; } + }; + + BuddyinfoBlocks() = default; + + // Set blocks from array of string_view tokens. 
+ BuddyinfoBlocks(std::string_view const* const tokens, size_t const num_blocks) { + for (size_t i = 0; i < num_blocks; ++i) { + size_t block; + std::from_chars(tokens[i].data(), tokens[i].data() + tokens[i].size(), block); + blocks.push_back(block); + } + } + + void addBlocks(BuddyinfoBlocks const& rhs) { + if (blocks.size() < rhs.blocks.size()) { + blocks.resize(rhs.blocks.size(), 0u); + } + for (size_t i = 0; i < rhs.blocks.size(); ++i) { + blocks[i] += rhs.blocks[i]; + } + } + + double fragPercent() const { + if (blocks.size() < 2u) { + return 0.0; // No fragmentation is possible with only one block column. + } + size_t scaled = 0; + size_t total = 0; + for (size_t order = 0; order < blocks.size(); ++order) { + size_t const pages = blocks[order] << order; + scaled += pages * (blocks.size() - 1 - order) / (blocks.size() - 1); + total += pages; + } + return total ? scaled * 100.0 / total : kErrorCodeOutOfMemory; + } + + size_t highestBlock() const { return blocks.empty() ? 0 : blocks.back(); } + + size_t sumAvailPages() const { + return std::accumulate(blocks.rbegin(), blocks.rend(), size_t(0), Horner{}); + } +}; + +// Split line on spaces into string_views. +small_vector tokenize(std::string_view const str) { + small_vector tokens; + size_t start = 0; + while (start < str.size()) { + // Find the start of the next token + start = str.find_first_not_of(' ', start); + // Check if we're at the end + if (start == std::string_view::npos) { + break; + } + // Find the end of the token. std::string_view::npos is ok. + size_t end = str.find(' ', start); + tokens.push_back(str.substr(start, end - start)); // Add the token to our list + start = end; // Set up for the next token + } + return tokens; +} + +} // namespace + +// Each row of /proc/buddyinfo is parsed into a BuddyinfoBlocks struct, +// from which the member variables are calculated. 
+void ProcBuddyinfoParser::parseBuddyinfo() { + std::ifstream file("/proc/buddyinfo"); + if (!file.is_open()) { + frag_percent_ = kErrorCodeUnableToOpenFile; + sum_highest_blocks_ = 0; + return; + } + + constexpr unsigned max_line_size = 256; + char line[max_line_size]; + + BuddyinfoBlocks frag; // Used to calculate frag_percent_. + + // Example: line = "Node 0, zone Normal 1 2 3 4 5 6 7 8 9 10 11" + // No CHECKs are done, and no exceptions are thrown. The worst that can happen is + // bad logs, which is not worth crashing the server or showing an error to the user. + while (file.getline(line, max_line_size)) { + auto tokens = tokenize(line); // Split on spaces. + // Sanity check on tokens.size() and known tokens. + if (5u <= tokens.size() && tokens[0] == "Node" && tokens[2] == "zone") { + BuddyinfoBlocks row(tokens.data() + 4, tokens.size() - 4); + + // Calculate member variables + frag.addBlocks(row); + if (tokens[3].substr(0, 3) != "DMA") { + sum_avail_pages_ += row.sumAvailPages(); + sum_highest_blocks_ += row.highestBlock(); + } + } + } + frag_percent_ = frag.fragPercent(); +} + } // namespace Data_Namespace diff --git a/DataMgr/DataMgr.h b/DataMgr/DataMgr.h index c24995dc75..81351ca829 100644 --- a/DataMgr/DataMgr.h +++ b/DataMgr/DataMgr.h @@ -108,76 +108,18 @@ class ProcMeminfoParser { auto end() { return items_.end(); } }; -//! Parse /proc/buddyinfo into a Fragmentation health score. +//! Parse /proc/buddyinfo into a few fragmentation-related data. class ProcBuddyinfoParser { - std::string inputText_; - std::vector orders_; - size_t fragmentationPercent_; - public: - ProcBuddyinfoParser(std::string text = {}) { - if (text.empty()) { - std::ifstream f("/proc/buddyinfo"); - std::stringstream ss; - ss << f.rdbuf(); - text = ss.str(); - } - inputText_ = text; - - const size_t skipped_columns = 4; - // NOTE(sy): For now this calculation ignores the first four buddyinfo columns, - // but in the future we could break out subscores by node and/or by zone. 
- size_t number_of_columns = 0; - for (const std::string& line : split(text, "\n")) { - if (line.empty()) { - continue; - } - const auto columns = split(line); - CHECK_GT(columns.size(), skipped_columns) << "unexpected line format: " << line; - if (number_of_columns != 0) { - CHECK_EQ(columns.size(), number_of_columns) - << "expected line to have " << number_of_columns << " columns: " << line; - } else { - number_of_columns = columns.size(); - orders_.resize(number_of_columns - skipped_columns, 0); - } - for (size_t i = skipped_columns; i < number_of_columns; ++i) { - orders_[i - skipped_columns] += strtoull(columns[i].c_str(), NULL, 10); - } - } -#ifdef __linux__ - const long page_size = - sysconf(_SC_PAGE_SIZE); // in case x86-64 is configured to use 2MB pages -#else - const long page_size = heavyai::get_page_size(); -#endif - size_t scaled = 0; - size_t total = 0; - for (size_t order = 0; order < orders_.size(); ++order) { - const size_t bytes = orders_[order] * (size_t(1) << order) * page_size; - scaled += (bytes * (orders_.size() - 1 - order)) / (orders_.size() - 1); - total += bytes; - } - - CHECK_GT(total, size_t(0)) << "failed to parse:\n" << text; - fragmentationPercent_ = (scaled * 100) / total; - } + void parseBuddyinfo(); // Set member variables. + double getFragmentationPercent() const { return frag_percent_; } + size_t getSumAvailPages() const { return sum_avail_pages_; } + size_t getSumHighestBlocks() const { return sum_highest_blocks_; } - auto operator[](size_t order) { - return orders_[order]; - } - auto begin() { - return orders_.begin(); - } - auto end() { - return orders_.end(); - } - auto getFragmentationPercent() { - return fragmentationPercent_; - } - auto getInputText() { - return inputText_; - } + private: + double frag_percent_{0}; // Weighted score of available memory. + size_t sum_avail_pages_{0}; // Sum of all available non-DMA pages. + size_t sum_highest_blocks_{0}; // Sum of highest non-DMA blocks. 
}; class DataMgr { @@ -243,13 +185,15 @@ class DataMgr { // NOTE(sy): Revisit how DataMgr should handle Cuda streams if Intel ever needs this. struct SystemMemoryUsage { - size_t free; // available CPU RAM memory in bytes - size_t total; // total CPU RAM memory in bytes - size_t resident; // resident process memory in bytes - size_t vtotal; // total process virtual memory in bytes - size_t regular; // process bytes non-shared - size_t shared; // process bytes shared (file maps + shmem) - size_t frag; // fragmentation percent + size_t free; // available CPU RAM memory in bytes + size_t total; // total CPU RAM memory in bytes + size_t resident; // resident process memory in bytes + size_t vtotal; // total process virtual memory in bytes + size_t regular; // process bytes non-shared + size_t shared; // process bytes shared (file maps + shmem) + double frag; // fragmentation percent + size_t avail_pages; // sum of all non-dma pages in /proc/buddyinfo + size_t high_blocks; // sum of highest non-dma blocks in /proc/buddyinfo }; SystemMemoryUsage getSystemMemoryUsage() const; @@ -280,8 +224,9 @@ class DataMgr { void createTopLevelMetadata() const; void allocateCpuBufferMgr(int32_t device_id, size_t total_cpu_size, - size_t minCpuSlabSize, - size_t maxCpuSlabSize, + size_t min_cpu_slab_size, + size_t max_cpu_slab_size, + size_t default_cpu_slab_size, size_t page_size, const std::vector& cpu_tier_sizes); diff --git a/DataMgr/FileMgr/FileBuffer.cpp b/DataMgr/FileMgr/FileBuffer.cpp index 085a6cfbd1..de0a0bb5dd 100644 --- a/DataMgr/FileMgr/FileBuffer.cpp +++ b/DataMgr/FileMgr/FileBuffer.cpp @@ -46,7 +46,7 @@ FileBuffer::FileBuffer(FileMgr* fm, , chunkKey_(chunkKey) { // Create a new FileBuffer CHECK(fm_); - calcHeaderBuffer(); + setBufferHeaderSize(); CHECK_GT(pageSize_, reservedHeaderSize_); pageDataSize_ = pageSize_ - reservedHeaderSize_; //@todo reintroduce initialSize - need to develop easy way of @@ -76,7 +76,7 @@ FileBuffer::FileBuffer(FileMgr* fm, , pageSize_(pageSize) 
, chunkKey_(chunkKey) { CHECK(fm_); - calcHeaderBuffer(); + setBufferHeaderSize(); pageDataSize_ = pageSize_ - reservedHeaderSize_; } @@ -93,7 +93,7 @@ FileBuffer::FileBuffer(FileMgr* fm, // We are being assigned an existing FileBuffer on disk CHECK(fm_); - calcHeaderBuffer(); + setBufferHeaderSize(); int32_t lastPageId = -1; int32_t curPageId = 0; for (auto vecIt = headerStartIt; vecIt != headerEndIt; ++vecIt) { @@ -142,14 +142,25 @@ void FileBuffer::reserve(const size_t numBytes) { } } -void FileBuffer::calcHeaderBuffer() { - // 3 * sizeof(int32_t) is for headerSize, for pageId and versionEpoch - // sizeof(size_t) is for chunkSize - reservedHeaderSize_ = (chunkKey_.size() + 3) * sizeof(int32_t); - size_t headerMod = reservedHeaderSize_ % headerBufferOffset_; - if (headerMod > 0) { - reservedHeaderSize_ += headerBufferOffset_ - headerMod; +namespace { +size_t calculate_buffer_header_size(size_t chunk_size) { + // Additional 3 * sizeof(int32_t) is for headerSize, pageId, and versionEpoch + size_t header_size = (chunk_size + 3) * sizeof(int32_t); + size_t header_mod = header_size % FileBuffer::kHeaderBufferOffset; + if (header_mod > 0) { + header_size += FileBuffer::kHeaderBufferOffset - header_mod; } + return header_size; +} +} // namespace + +void FileBuffer::setBufferHeaderSize() { + reservedHeaderSize_ = calculate_buffer_header_size(chunkKey_.size()); +} + +size_t FileBuffer::getMinPageSize() { + constexpr size_t max_chunk_size{5}; + return calculate_buffer_header_size(max_chunk_size) + 1; } void FileBuffer::freePage(const Page& page) { diff --git a/DataMgr/FileMgr/FileBuffer.h b/DataMgr/FileMgr/FileBuffer.h index e82c78ea43..b066024e54 100644 --- a/DataMgr/FileMgr/FileBuffer.h +++ b/DataMgr/FileMgr/FileBuffer.h @@ -159,10 +159,12 @@ class FileBuffer : public AbstractBuffer { size_t numChunkPages() const; std::string dump() const; + static size_t getMinPageSize(); + // Used for testing void freePage(const Page& page); - static constexpr size_t 
headerBufferOffset_ = 32; + static constexpr size_t kHeaderBufferOffset{32}; private: // FileBuffer(const FileBuffer&); // private copy constructor @@ -178,7 +180,7 @@ class FileBuffer : public AbstractBuffer { const bool writeMetadata = false); void writeMetadata(const int32_t epoch); void readMetadata(const Page& page); - void calcHeaderBuffer(); + void setBufferHeaderSize(); void freePage(const Page& page, const bool isRolloff); void freePagesBeforeEpochForMultiPage(MultiPage& multiPage, diff --git a/DataMgr/ForeignStorage/InternalCatalogDataWrapper.cpp b/DataMgr/ForeignStorage/InternalCatalogDataWrapper.cpp index aec87466bd..735bd331d9 100644 --- a/DataMgr/ForeignStorage/InternalCatalogDataWrapper.cpp +++ b/DataMgr/ForeignStorage/InternalCatalogDataWrapper.cpp @@ -464,7 +464,7 @@ std::map> get_all_dashboards() { auto& sys_catalog = Catalog_Namespace::SysCatalog::instance(); for (const auto& catalog : sys_catalog.getCatalogsForAllDbs()) { if (catalog->name() != shared::kInfoSchemaDbName) { - for (const auto& dashboard : catalog->getAllDashboardsMetadataCopy()) { + for (const auto& dashboard : catalog->getAllDashboardsMetadataForSysTable()) { dashboards_by_database[catalog->getDatabaseId()].emplace_back(dashboard); } } diff --git a/DataMgr/ForeignStorage/LazyParquetChunkLoader.cpp b/DataMgr/ForeignStorage/LazyParquetChunkLoader.cpp index 6d519901c1..3595f880cf 100644 --- a/DataMgr/ForeignStorage/LazyParquetChunkLoader.cpp +++ b/DataMgr/ForeignStorage/LazyParquetChunkLoader.cpp @@ -44,6 +44,7 @@ #include "ParquetTimeEncoder.h" #include "ParquetTimestampEncoder.h" #include "ParquetVariableLengthArrayEncoder.h" +#include "Shared/measure.h" #include "Shared/misc.h" #include "StringDictionary/StringDictionary.h" #include "TypedParquetDetectBuffer.h" @@ -1482,7 +1483,10 @@ SQLTypeInfo suggest_string_mapping(const parquet::ColumnDescriptor* parquet_colu SQLTypeInfo type; type.set_type(kTEXT); type.set_compression(kENCODING_DICT); - type.set_comp_param(32); + 
type.set_comp_param(0); // `comp_param` is expected either to be zero or + // equal to a string dictionary id in some code + // paths, since we don't have a string dictionary we + // set this to zero type.set_fixed_size(); return type; } @@ -1829,7 +1833,7 @@ std::list> LazyParquetChunkLoader::appendRowGroup StringDictionary* string_dictionary, RejectedRowIndices* rejected_row_indices, const bool is_for_detect, - const std::optional max_levels_read) { + const std::optional max_rows_to_read) { auto timer = DEBUG_TIMER(__func__); std::list> chunk_metadata; // `def_levels` and `rep_levels` below are used to store the read definition @@ -1839,6 +1843,17 @@ std::list> LazyParquetChunkLoader::appendRowGroup std::vector rep_levels(LazyParquetChunkLoader::batch_reader_num_elements); std::vector values; + // Timing information used in logging + Timer<> summary_timer; + Timer<> initialization_timer_ms; + Timer<> validation_timer_ms; + Timer<> parquet_read_timer_ms; + Timer<> encoding_timer_ms; + size_t total_row_groups_read = 0; + + summary_timer.start(); + + initialization_timer_ms.start(); CHECK(!row_group_intervals.empty()); const auto& first_file_path = row_group_intervals.front().file_path; @@ -1864,10 +1879,12 @@ std::list> LazyParquetChunkLoader::appendRowGroup encoder->initializeErrorTracking(); } encoder->initializeColumnType(column_descriptor->columnType); + initialization_timer_ms.stop(); bool early_exit = false; - int64_t total_levels_read = 0; + int64_t total_rows_read = 0; for (const auto& row_group_interval : row_group_intervals) { + initialization_timer_ms.start(); const auto& file_path = row_group_interval.file_path; auto file_reader = file_reader_cache_->getOrInsert(file_path, file_system_); @@ -1879,6 +1896,10 @@ std::list> LazyParquetChunkLoader::appendRowGroup parquet::ParquetFileReader* parquet_reader = file_reader->parquet_reader(); auto parquet_column_descriptor = get_column_descriptor(file_reader, parquet_column_index); + + 
initialization_timer_ms.stop(); + + validation_timer_ms.start(); validate_equal_column_descriptor(first_parquet_column_descriptor, parquet_column_descriptor, first_file_path, @@ -1888,17 +1909,22 @@ std::list> LazyParquetChunkLoader::appendRowGroup parquet_column_descriptor); set_definition_levels_for_zero_max_definition_level_case(parquet_column_descriptor, def_levels); + validation_timer_ms.stop(); int64_t values_read = 0; for (int row_group_index = row_group_interval.start_index; row_group_index <= row_group_interval.end_index; ++row_group_index) { + total_row_groups_read++; + parquet_read_timer_ms.start(); auto group_reader = parquet_reader->RowGroup(row_group_index); std::shared_ptr col_reader = group_reader->Column(parquet_column_index); + parquet_read_timer_ms.stop(); try { while (col_reader->HasNext()) { + parquet_read_timer_ms.start(); int64_t levels_read = parquet::ScanAllValues(LazyParquetChunkLoader::batch_reader_num_elements, def_levels.data(), @@ -1906,7 +1932,9 @@ std::list> LazyParquetChunkLoader::appendRowGroup reinterpret_cast(values.data()), &values_read, col_reader.get()); + parquet_read_timer_ms.stop(); + encoding_timer_ms.start(); if (rejected_row_indices) { // error tracking is enabled encoder->appendDataTrackErrors(def_levels.data(), rep_levels.data(), @@ -1928,18 +1956,31 @@ std::list> LazyParquetChunkLoader::appendRowGroup levels_read, values.data()); } + encoding_timer_ms.stop(); + + if (max_rows_to_read.has_value()) { + if (column_descriptor->columnType.is_array()) { + auto array_encoder = + dynamic_cast(encoder.get()); + CHECK(array_encoder); + total_rows_read = array_encoder->getArraysCount(); + } else { + // For scalar types it is safe to assume the number of levels read is equal + // to the number of rows read + total_rows_read += levels_read; + } - if (max_levels_read.has_value()) { - total_levels_read += levels_read; - if (total_levels_read >= max_levels_read.value()) { + if (total_rows_read >= max_rows_to_read.value()) { 
early_exit = true; break; } } } + encoding_timer_ms.start(); if (auto array_encoder = dynamic_cast(encoder.get())) { array_encoder->finalizeRowGroup(); } + encoding_timer_ms.stop(); } catch (const std::exception& error) { // check for a specific error to detect a possible unexpected switch of data // source in order to respond with informative error message @@ -1959,18 +2000,36 @@ std::list> LazyParquetChunkLoader::appendRowGroup ", Parquet column: '" + col_reader->descr()->path()->ToDotString() + "', Parquet file: '" + file_path + "'"); } - if (max_levels_read.has_value() && early_exit) { + if (max_rows_to_read.has_value() && early_exit) { break; } } - if (max_levels_read.has_value() && early_exit) { + if (max_rows_to_read.has_value() && early_exit) { break; } } + encoding_timer_ms.start(); if (rejected_row_indices) { // error tracking is enabled *rejected_row_indices = encoder->getRejectedRowIndices(); } + encoding_timer_ms.stop(); + + summary_timer.stop(); + + VLOG(1) << "Appended " << total_row_groups_read + << " row groups to chunk. Column: " << column_descriptor->columnName + << ", Column id: " << column_descriptor->columnId << ", Parquet column: " + << first_parquet_column_descriptor->path()->ToDotString(); + VLOG(1) << "Runtime summary:"; + VLOG(1) << " Parquet chunk loading total time: " << summary_timer.elapsed() << "ms"; + VLOG(1) << " Parquet encoder initialization time: " << initialization_timer_ms.elapsed() + << "ms"; + VLOG(1) << " Parquet metadata validation time: " << validation_timer_ms.elapsed() + << "ms"; + VLOG(1) << " Parquet column read time: " << parquet_read_timer_ms.elapsed() << "ms"; + VLOG(1) << " Parquet data conversion time: " << encoding_timer_ms.elapsed() << "ms"; + return chunk_metadata; } @@ -2472,6 +2531,12 @@ std::list LazyParquetChunkLoader::metadataScan( schema, do_metadata_stats_validation); + // Iterate asynchronously over any paths beyond the first. 
+ auto table_ptr = schema.getForeignTable(); + CHECK(table_ptr); + auto num_threads = foreign_storage::get_num_threads(*table_ptr); + VLOG(1) << "Metadata scan using " << num_threads << " threads"; + const bool geo_validate_geometry = foreign_table_->getOptionAsBool(ForeignTable::GEO_VALIDATE_GEOMETRY_KEY); auto encoder_map = populate_encoder_map_for_metadata_scan(column_interval, @@ -2480,8 +2545,10 @@ std::list LazyParquetChunkLoader::metadataScan( do_metadata_stats_validation, geo_validate_geometry); const auto num_row_groups = get_parquet_table_size(first_reader).first; + VLOG(1) << "Starting metadata scan of path " << first_path; auto row_group_metadata = metadata_scan_rowgroup_interval( encoder_map, {first_path, 0, num_row_groups - 1}, first_reader, schema); + VLOG(1) << "Completed metadata scan of path " << first_path; // We want each (filepath->FileReader) pair in the cache to be initialized before we // multithread so that we are not adding keys in a concurrent environment, so we add @@ -2495,10 +2562,6 @@ std::list LazyParquetChunkLoader::metadataScan( cache_subset.emplace_back(*path_it); } - // Iterate asyncronously over any paths beyond the first. 
- auto table_ptr = schema.getForeignTable(); - CHECK(table_ptr); - auto num_threads = foreign_storage::get_num_threads(*table_ptr); auto paths_per_thread = partition_for_threads(cache_subset, num_threads); std::vector, MaxRowGroupSizeStats>>> futures; @@ -2507,10 +2570,21 @@ std::list LazyParquetChunkLoader::metadataScan( std::launch::async, [&](const auto& paths, const auto& file_reader_cache) -> std::pair, MaxRowGroupSizeStats> { + Timer<> summary_timer; + Timer<> get_or_insert_reader_timer_ms; + Timer<> validation_timer_ms; + Timer<> metadata_scan_timer; + + summary_timer.start(); + std::list reduced_metadata; MaxRowGroupSizeStats max_row_group_stats{0, 0}; for (const auto& path : paths.get()) { + get_or_insert_reader_timer_ms.start(); auto reader = file_reader_cache.get().getOrInsert(path, file_system_); + get_or_insert_reader_timer_ms.stop(); + + validation_timer_ms.start(); validate_equal_schema(first_reader, reader, first_path, path); auto local_max_row_group_stats = validate_parquet_metadata(reader->parquet_reader()->metadata(), @@ -2521,12 +2595,33 @@ std::list LazyParquetChunkLoader::metadataScan( max_row_group_stats.max_row_group_size) { max_row_group_stats = local_max_row_group_stats; } + validation_timer_ms.stop(); + + VLOG(1) << "Starting metadata scan of path " << path; + + metadata_scan_timer.start(); const auto num_row_groups = get_parquet_table_size(reader).first; const auto interval = RowGroupInterval{path, 0, num_row_groups - 1}; reduced_metadata.splice( reduced_metadata.end(), metadata_scan_rowgroup_interval(encoder_map, interval, reader, schema)); + metadata_scan_timer.stop(); + + VLOG(1) << "Completed metadata scan of path " << path; } + + summary_timer.stop(); + + VLOG(1) << "Runtime summary:"; + VLOG(1) << " Parquet metadata scan total time: " << summary_timer.elapsed() + << "ms"; + VLOG(1) << " Parquet file reader opening time: " + << get_or_insert_reader_timer_ms.elapsed() << "ms"; + VLOG(1) << " Parquet metadata validation time: " + << 
validation_timer_ms.elapsed() << "ms"; + VLOG(1) << " Parquet metadata processing time: " + << validation_timer_ms.elapsed() << "ms"; + return {reduced_metadata, max_row_group_stats}; }, std::ref(path_group), diff --git a/DataMgr/ForeignStorage/ParquetArrayDetectEncoder.h b/DataMgr/ForeignStorage/ParquetArrayDetectEncoder.h index 1c3db36671..4bccd954e5 100644 --- a/DataMgr/ForeignStorage/ParquetArrayDetectEncoder.h +++ b/DataMgr/ForeignStorage/ParquetArrayDetectEncoder.h @@ -48,6 +48,8 @@ class ParquetArrayDetectEncoder : public ParquetArrayEncoder { updateMetadataForAppendedArrayItem(encoded_index); } + size_t getArraysCount() const { return detect_buffer_->getStrings().size(); } + protected: void encodeAllValues(const int8_t* values, const int64_t values_read) override { if (!is_string_array_) { diff --git a/DataMgr/ForeignStorage/ParquetDataWrapper.cpp b/DataMgr/ForeignStorage/ParquetDataWrapper.cpp index 600cfc8951..7d8581a2a0 100644 --- a/DataMgr/ForeignStorage/ParquetDataWrapper.cpp +++ b/DataMgr/ForeignStorage/ParquetDataWrapper.cpp @@ -634,16 +634,27 @@ void ParquetDataWrapper::populateChunkBuffers(const ChunkToBufferMap& required_b chunk_key[CHUNK_KEY_FRAGMENT_IDX]); } + const logger::ThreadLocalIds parent_thread_local_ids = logger::thread_local_ids(); + std::function&)> lambda = [&, this](const std::set& hint_set) { + // Enable debug timers + logger::LocalIdsScopeGuard lisg = parent_thread_local_ids.setNewThreadId(); + DEBUG_TIMER_NEW_THREAD(parent_thread_local_ids.thread_id_); + for (const auto& [col_id, frag_id] : hint_set) { loadBuffersUsingLazyParquetChunkLoader( col_id, frag_id, buffers_to_load, delete_buffer); + VLOG(1) << "Loaded key " << db_id_ << "," << foreign_table_->tableId << "," + << col_id << "," << frag_id; } }; CHECK(foreign_table_); auto num_threads = foreign_storage::get_num_threads(*foreign_table_); + + VLOG(1) << "Populating chunk from parquet source using " + std::to_string(num_threads) + + " threads."; auto futures = 
create_futures_for_workers(col_frag_hints, num_threads, lambda); // We wait on all futures, then call get because we want all threads to have finished diff --git a/HeavyDB.cpp b/HeavyDB.cpp index 89c00d8a92..d8db98999f 100644 --- a/HeavyDB.cpp +++ b/HeavyDB.cpp @@ -40,6 +40,7 @@ #include "Shared/SystemParameters.h" #include "Shared/file_delete.h" #include "Shared/heavyai_shared_mutex.h" +#include "Shared/misc.h" #include "Shared/scope.h" #include @@ -53,6 +54,10 @@ #include #endif +#ifdef __linux__ +#include +#endif + #include #include #include @@ -635,6 +640,14 @@ int startHeavyDBServer(CommandLineOptions& prog_config_opts, } } +void log_startup_info() { +#ifdef __linux__ + VLOG(1) << "sysconf(_SC_PAGE_SIZE): " << sysconf(_SC_PAGE_SIZE); + VLOG(1) << "/proc/buddyinfo: " << shared::FileContentsEscaper{"/proc/buddyinfo"}; + VLOG(1) << "/proc/meminfo: " << shared::FileContentsEscaper{"/proc/meminfo"}; +#endif +} + int main(int argc, char** argv) { bool has_clust_topo = false; @@ -649,6 +662,7 @@ int main(int argc, char** argv) { if (!has_clust_topo) { prog_config_opts.validate_base_path(); prog_config_opts.validate(); + log_startup_info(); return (startHeavyDBServer(prog_config_opts)); } } catch (std::runtime_error& e) { diff --git a/HeavyIQ/CMakeLists.txt b/HeavyIQ/CMakeLists.txt index e5f2396675..592119c5e3 100644 --- a/HeavyIQ/CMakeLists.txt +++ b/HeavyIQ/CMakeLists.txt @@ -1,4 +1,4 @@ -set(HEAVYIQ_BUILD_ID "main" CACHE STRING "HeavyIQ Build ID") +set(HEAVYIQ_BUILD_ID "master" CACHE STRING "HeavyIQ Build ID") set(HEAVYIQ_URL "http://builds.mapd.com/heavyiq/heavyiq-${HEAVYIQ_BUILD_ID}/dist.tgz") include(ExternalProject) diff --git a/ImportExport/CopyParams.h b/ImportExport/CopyParams.h index 5dd925259a..c854c0759b 100644 --- a/ImportExport/CopyParams.h +++ b/ImportExport/CopyParams.h @@ -92,6 +92,7 @@ struct CopyParams { bool raster_point_compute_angle; std::string raster_import_dimensions; std::string add_metadata_columns; + bool raster_drop_if_all_null; // odbc 
parameters std::string sql_select; std::string sql_order_by; @@ -138,7 +139,8 @@ struct CopyParams { , raster_point_type(RasterPointType::kAuto) , raster_scanlines_per_thread(32) , raster_point_transform(RasterPointTransform::kAuto) - , raster_point_compute_angle{false} {} + , raster_point_compute_angle{false} + , raster_drop_if_all_null{false} {} CopyParams(char d, const std::string& n, char l, size_t b, size_t retries, size_t wait) : delimiter(d) @@ -172,7 +174,8 @@ struct CopyParams { , raster_point_type(RasterPointType::kAuto) , raster_scanlines_per_thread(32) , raster_point_transform(RasterPointTransform::kAuto) - , raster_point_compute_angle{false} {} + , raster_point_compute_angle{false} + , raster_drop_if_all_null{false} {} }; } // namespace import_export diff --git a/ImportExport/Importer.cpp b/ImportExport/Importer.cpp index d3fdf8f973..fdbe13d40a 100644 --- a/ImportExport/Importer.cpp +++ b/ImportExport/Importer.cpp @@ -2385,7 +2385,7 @@ static ImportStatus import_thread_shapefile( check_session_interrupted(query_session, executor))) { thread_import_status.load_failed = true; thread_import_status.load_msg = "Table load was cancelled via Query Interrupt"; - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } uint32_t field_column_count{0u}; @@ -2624,7 +2624,7 @@ static ImportStatus import_thread_shapefile( } thread_import_status.rows_completed++; } catch (QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { throw e; } } catch (ColumnNotGeoError& e) { @@ -5844,6 +5844,25 @@ ImportStatus Importer::importGDALRaster( bool read_block_failed = false; + // prepare to store which band values in which rows are null + boost::dynamic_bitset<> row_band_nulls; + if (copy_params.raster_drop_if_all_null) { + row_band_nulls.resize(num_elems * num_bands); + } + + auto set_row_band_null = [&](const int row, const uint32_t band) { + auto const 
bit_index = (row * num_bands) + band; + row_band_nulls.set(bit_index); + }; + auto all_row_bands_null = [&](const int row) -> bool { + auto const first_bit_index = row * num_bands; + bool all_null = true; + for (auto i = first_bit_index; i < first_bit_index + num_bands; i++) { + all_null = all_null && row_band_nulls.test(i); + } + return all_null; + }; + // for each band/column for (uint32_t band_idx = 0; band_idx < num_bands; band_idx++) { // the corresponding column @@ -5884,6 +5903,9 @@ ImportStatus Importer::importGDALRaster( if (null_value_valid && value == static_cast(null_value)) { td.is_null = true; td.val.int_val = NULL_SMALLINT; + if (copy_params.raster_drop_if_all_null) { + set_row_band_null(idx, band_idx); + } } else { td.is_null = false; td.val.int_val = static_cast(value); @@ -5899,6 +5921,9 @@ ImportStatus Importer::importGDALRaster( if (null_value_valid && value == static_cast(null_value)) { td.is_null = true; td.val.int_val = NULL_INT; + if (copy_params.raster_drop_if_all_null) { + set_row_band_null(idx, band_idx); + } } else { td.is_null = false; td.val.int_val = static_cast(value); @@ -5914,6 +5939,9 @@ ImportStatus Importer::importGDALRaster( if (null_value_valid && value == static_cast(null_value)) { td.is_null = true; td.val.int_val = NULL_INT; + if (copy_params.raster_drop_if_all_null) { + set_row_band_null(idx, band_idx); + } } else { td.is_null = false; td.val.int_val = static_cast(value); @@ -5928,6 +5956,9 @@ ImportStatus Importer::importGDALRaster( if (null_value_valid && value == static_cast(null_value)) { td.is_null = true; td.val.real_val = NULL_FLOAT; + if (copy_params.raster_drop_if_all_null) { + set_row_band_null(idx, band_idx); + } } else { td.is_null = false; td.val.real_val = static_cast(value); @@ -5942,6 +5973,9 @@ ImportStatus Importer::importGDALRaster( if (null_value_valid && value == null_value) { td.is_null = true; td.val.real_val = NULL_DOUBLE; + if (copy_params.raster_drop_if_all_null) { + set_row_band_null(idx, 
band_idx); + } } else { td.is_null = false; td.val.real_val = value; @@ -5964,7 +5998,9 @@ ImportStatus Importer::importGDALRaster( for (auto& col_buffer : import_buffers) { col_buffer->clear(); } - thread_import_status.rows_rejected += num_elems; + thread_import_status.rows_estimated = 0; + thread_import_status.rows_completed = 0; + thread_import_status.rows_rejected = num_elems; } else { // metadata columns? for (auto const& mci : metadata_column_infos) { @@ -5975,8 +6011,50 @@ ImportStatus Importer::importGDALRaster( } col_idx++; } - thread_import_status.rows_estimated = num_elems; - thread_import_status.rows_completed = num_elems; + + // drop rows where all band columns are null? + int num_dropped_as_all_null = 0; + if (copy_params.raster_drop_if_all_null) { + // capture rows where ALL the band values (only) were NULL + // count rows first (implies two passes on the bitset but + // still quicker than building the row set if not needed, + // in the case where ALL rows are to be dropped) + for (int row = 0; row < num_elems; row++) { + if (all_row_bands_null(row)) { + num_dropped_as_all_null++; + } + } + // delete those rows from ALL column buffers (including coords and metadata) + if (num_dropped_as_all_null == num_elems) { + // all rows need dropping, just clear (fast) + for (auto& col_buffer : import_buffers) { + col_buffer->clear(); + } + } else if (num_dropped_as_all_null > 0) { + // drop "bad" rows selectively (slower) + // build row set to drop + BadRowsTracker bad_rows_tracker; + for (int row = 0; row < num_elems; row++) { + if (all_row_bands_null(row)) { + bad_rows_tracker.rows.emplace(static_cast(row)); + } + } + // then delete rows + for (auto& col_buffer : import_buffers) { + auto const* cd = col_buffer->getColumnDesc(); + CHECK(cd); + auto const col_type = cd->columnType.get_type(); + col_buffer->del_values(col_type, &bad_rows_tracker); + } + } + } + + // final count + CHECK_LE(num_dropped_as_all_null, num_elems); + auto const actual_num_elems = 
num_elems - num_dropped_as_all_null; + thread_import_status.rows_estimated = actual_num_elems; + thread_import_status.rows_completed = actual_num_elems; + thread_import_status.rows_rejected = 0; } // done @@ -6005,7 +6083,6 @@ ImportStatus Importer::importGDALRaster( VLOG(1) << "Raster Importer: scanlines_in_block: " << scanlines_in_block << ", block_max_scanlines_per_thread: " << block_max_scanlines_per_thread; - std::vector rows_per_thread; auto block_wall_timer = timer_start(); // run max_threads scanlines at once for (size_t thread_id = 0; thread_id < max_threads; thread_id++) { @@ -6013,7 +6090,6 @@ ImportStatus Importer::importGDALRaster( if (y_start < band_size_y) { const int y_end = std::min(y_start + block_max_scanlines_per_thread, band_size_y); if (y_start < y_end) { - rows_per_thread.emplace_back((y_end - y_start) * band_size_x); futures.emplace_back( std::async(std::launch::async, import_rows, thread_id, y_start, y_end)); } @@ -6036,8 +6112,8 @@ ImportStatus Importer::importGDALRaster( // fashion so we can simultaneously read the next batch of data auto thread_load_timer = timer_start(); // only try to load this thread's data if valid - if (import_status.rows_rejected == 0) { - load(import_buffers_vec[thread_idx], rows_per_thread[thread_idx], session_info); + if (import_status.rows_completed > 0) { + load(import_buffers_vec[thread_idx], import_status.rows_completed, session_info); } load_s += TIMER_STOP(thread_load_timer); ++thread_idx; @@ -6065,7 +6141,7 @@ ImportStatus Importer::importGDALRaster( if (UNLIKELY(check_session_interrupted(query_session, executor.get()))) { import_status_.load_failed = true; import_status_.load_msg = "Raster Import interrupted"; - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } // hit max_reject? 
diff --git a/LockMgr/LockMgr.cpp b/LockMgr/LockMgr.cpp index 30313d8053..0dfa7f6b70 100644 --- a/LockMgr/LockMgr.cpp +++ b/LockMgr/LockMgr.cpp @@ -14,16 +14,9 @@ * limitations under the License. */ -#include "LockMgr/LockMgrImpl.h" - -#include - +#include "LockMgr/LockMgr.h" #include "Catalog/Catalog.h" -#include "LockMgr/LegacyLockMgr.h" -#include "QueryEngine/JsonAccessors.h" -#include "QueryRunner/QueryRunner.h" -#include "Shared/types.h" -#include "gen-cpp/CalciteServer.h" +#include "LockMgr/LockMgrImpl.h" namespace lockmgr { @@ -42,4 +35,269 @@ ChunkKey chunk_key_for_table(const Catalog_Namespace::Catalog& cat, } // namespace helpers +void MutexTracker::lock() { + ref_count_.fetch_add(1u); + if (!g_multi_instance) { + mutex_.lock(); + } else { + dmutex_->lock(); + } +} + +bool MutexTracker::try_lock() { + bool gotlock{false}; + if (!g_multi_instance) { + gotlock = mutex_.try_lock(); + } else { + gotlock = dmutex_->try_lock(); + } + if (gotlock) { + ref_count_.fetch_add(1u); + } + return gotlock; +} + +void MutexTracker::unlock() { + if (!g_multi_instance) { + mutex_.unlock(); + } else { + dmutex_->unlock(); + } + ref_count_.fetch_sub(1u); +} + +void MutexTracker::lock_shared() { + ref_count_.fetch_add(1u); + if (!g_multi_instance) { + mutex_.lock_shared(); + } else { + dmutex_->lock_shared(); + } +} + +bool MutexTracker::try_lock_shared() { + bool gotlock{false}; + if (!g_multi_instance) { + gotlock = mutex_.try_lock_shared(); + } else { + gotlock = dmutex_->try_lock_shared(); + } + if (gotlock) { + ref_count_.fetch_add(1u); + } + return gotlock; +} + +void MutexTracker::unlock_shared() { + if (!g_multi_instance) { + mutex_.unlock_shared(); + } else { + dmutex_->unlock_shared(); + } + ref_count_.fetch_sub(1u); +} + +template +T& instance() { + static T mgr; + return mgr; +} + +template +MutexTracker* TableLockMgrImpl::getTableMutex(const ChunkKey& table_key) { + std::lock_guard access_map_lock(map_mutex_); + auto mutex_it = table_mutex_map_.find(table_key); + 
if (mutex_it != table_mutex_map_.end()) { + return mutex_it->second.get(); + } + + // NOTE(sy): Only used by --multi-instance clusters. + std::unique_ptr dmutex = + getClusterTableMutex(table_key); + + return table_mutex_map_ + .emplace(table_key, std::make_unique(std::move(dmutex))) + .first->second.get(); +} + +template +std::set TableLockMgrImpl::getLockedTables() const { + std::set ret; + std::lock_guard access_map_lock(map_mutex_); + for (const auto& kv : table_mutex_map_) { + if (kv.second->isAcquired()) { + ret.insert(kv.first); + } + } + + return ret; +} + +template +WriteLock TableLockMgrImpl::getWriteLockForTable(const Catalog_Namespace::Catalog& cat, + const std::string& table_name) { + auto lock = WriteLock(getMutexTracker(cat, table_name)); + // Ensure table still exists after lock is acquired. + validateExistingTable(cat, table_name); + return lock; +} + +template +WriteLock TableLockMgrImpl::getWriteLockForTable(const ChunkKey& table_key) { + auto& table_lock_mgr = T::instance(); + return WriteLock(table_lock_mgr.getTableMutex(table_key)); +} + +template +ReadLock TableLockMgrImpl::getReadLockForTable(Catalog_Namespace::Catalog& cat, + const std::string& table_name) { + auto lock = ReadLock(getMutexTracker(cat, table_name)); + // Ensure table still exists after lock is acquired. + validateAndGetExistingTableId(cat, table_name); + return lock; +} + +template +ReadLock TableLockMgrImpl::getReadLockForTable(const ChunkKey& table_key) { + auto& table_lock_mgr = T::instance(); + return ReadLock(table_lock_mgr.getTableMutex(table_key)); +} + +template +std::unique_ptr +TableLockMgrImpl::getClusterTableMutex(const ChunkKey& table_key) const { + std::unique_ptr table_mutex; + + std::string table_key_as_text; + for (auto n : table_key) { + table_key_as_text += (!table_key_as_text.empty() ? 
"_" : "") + std::to_string(n); + } + + // A callback used for syncing with most of the changed Catalog metadata, in-general, + // such as the list of tables that exist, dashboards, etc. This is accomplished by + // read locking, and immediately unlocking, dcatalogMutex_, so + // cat->reloadCatalogMetadataUnlocked() will be called. + auto cb_reload_catalog_metadata = [table_key](bool write) { + if constexpr (T::kind == "insert") { + CHECK(write); // The insert lock is for writing, never for reading. + } + auto cat = + Catalog_Namespace::SysCatalog::instance().getCatalog(table_key[CHUNK_KEY_DB_IDX]); + CHECK(cat); + heavyai::shared_lock dread_lock( + *cat->dcatalogMutex_); + }; + + if constexpr (T::kind == "schema") { + // A callback used for reloading the Catalog schema for the one table being locked. + auto cb_reload_table_metadata = [table_key, table_key_as_text](size_t version) { + VLOG(2) << "reloading table metadata for: table_" << table_key_as_text; + CHECK_EQ(table_key.size(), 2U); + auto cat = Catalog_Namespace::SysCatalog::instance().getCatalog( + table_key[CHUNK_KEY_DB_IDX]); + CHECK(cat); + heavyai::shared_lock dread_lock( + *cat->dcatalogMutex_); + cat->reloadTableMetadataUnlocked(table_key[CHUNK_KEY_TABLE_IDX]); + }; + + // Create the table mutex. + heavyai::DistributedSharedMutex::Callbacks cbs{ + cb_reload_catalog_metadata, // pre_lock_callback + cb_reload_table_metadata // reload_cache_callback + }; + auto schema_lockfile{ + std::filesystem::path(g_base_path) / shared::kLockfilesDirectoryName / + shared::kCatalogDirectoryName / + ("table_" + table_key_as_text + "." + T::kind.data() + ".lockfile")}; + table_mutex = + std::make_unique(schema_lockfile.string(), cbs); + } else if constexpr (T::kind == "data" || T::kind == "insert") { + // A callback used for reloading the DataMgr data for the one table being locked. 
+ auto cb_reload_table_data = [table_key, table_key_as_text](size_t version) { + VLOG(2) << "invalidating table caches for new version " << version << " of: table_" + << table_key_as_text; + CHECK_EQ(table_key.size(), 2U); + auto cat = Catalog_Namespace::SysCatalog::instance().getCatalog( + table_key[CHUNK_KEY_DB_IDX]); + CHECK(cat); + cat->invalidateCachesForTable(table_key[CHUNK_KEY_TABLE_IDX]); + }; + + // Create the rows mutex. + auto rows_lockfile{std::filesystem::path(g_base_path) / + shared::kLockfilesDirectoryName / shared::kDataDirectoryName / + ("table_" + table_key_as_text + ".rows.lockfile")}; + std::shared_ptr rows_mutex = + std::make_shared( + rows_lockfile.string(), + cb_reload_table_data // reload_cache_callback + ); + + // A callback used for syncing with outside changes to this table's row data. + auto cb_reload_row_data = [table_key, rows_mutex](bool write) { + heavyai::shared_lock rows_read_lock(*rows_mutex); + }; + + // A callback to notify other nodes about our changes to this table's row data. + auto cb_notify_about_row_data = [table_key, rows_mutex](bool write) { + if (write) { + heavyai::unique_lock rows_write_lock( + *rows_mutex); + } + }; + + // Create the table mutex. + heavyai::DistributedSharedMutex::Callbacks cbs{ + cb_reload_catalog_metadata, // pre_lock_callback + {}, + cb_reload_row_data, // post_lock_callback + cb_notify_about_row_data // pre_unlock_callback + }; + auto table_lockfile{ + std::filesystem::path(g_base_path) / shared::kLockfilesDirectoryName / + shared::kDataDirectoryName / + ("table_" + table_key_as_text + "." 
+ T::kind.data() + ".lockfile")}; + table_mutex = + std::make_unique(table_lockfile.string(), cbs); + } else { + UNREACHABLE() << "unexpected lockmgr kind: " << T::kind; + } + + return table_mutex; +} + +template +MutexTracker* TableLockMgrImpl::getMutexTracker( + const Catalog_Namespace::Catalog& catalog, + const std::string& table_name) { + ChunkKey chunk_key{catalog.getDatabaseId(), + validateAndGetExistingTableId(catalog, table_name)}; + auto& table_lock_mgr = T::instance(); + MutexTracker* tracker = table_lock_mgr.getTableMutex(chunk_key); + CHECK(tracker); + return tracker; +} + +template +void TableLockMgrImpl::validateExistingTable(const Catalog_Namespace::Catalog& catalog, + const std::string& table_name) { + validateAndGetExistingTableId(catalog, table_name); +} + +template +int32_t TableLockMgrImpl::validateAndGetExistingTableId( + const Catalog_Namespace::Catalog& catalog, + const std::string& table_name) { + auto table_id = catalog.getTableId(table_name); + if (!table_id.has_value()) { + throw Catalog_Namespace::TableNotFoundException(table_name, catalog.name()); + } + return table_id.value(); +} + +template class TableLockMgrImpl; +template class TableLockMgrImpl; +template class TableLockMgrImpl; + } // namespace lockmgr diff --git a/LockMgr/LockMgrImpl.h b/LockMgr/LockMgrImpl.h index e1db17318a..dd2b328b5b 100644 --- a/LockMgr/LockMgrImpl.h +++ b/LockMgr/LockMgrImpl.h @@ -23,9 +23,7 @@ #include #include "Catalog/Catalog.h" -#include "DataMgr/FileMgr/GlobalFileMgr.h" #include "OSDependent/heavyai_locks.h" -#include "QueryEngine/ExternalCacheInvalidators.h" #include "Shared/heavyai_shared_mutex.h" #include "Shared/types.h" @@ -42,63 +40,13 @@ class MutexTracker : public heavyai::SharedMutexInterface { MutexTracker(std::unique_ptr dmutex) : ref_count_(0u), dmutex_(std::move(dmutex)) {} - virtual void lock() { - ref_count_.fetch_add(1u); - if (!g_multi_instance) { - mutex_.lock(); - } else { - dmutex_->lock(); - } - } - virtual bool try_lock() { - bool 
gotlock{false}; - if (!g_multi_instance) { - gotlock = mutex_.try_lock(); - } else { - gotlock = dmutex_->try_lock(); - } - if (gotlock) { - ref_count_.fetch_add(1u); - } - return gotlock; - } - virtual void unlock() { - if (!g_multi_instance) { - mutex_.unlock(); - } else { - dmutex_->unlock(); - } - ref_count_.fetch_sub(1u); - } + virtual void lock(); + virtual bool try_lock(); + virtual void unlock(); - virtual void lock_shared() { - ref_count_.fetch_add(1u); - if (!g_multi_instance) { - mutex_.lock_shared(); - } else { - dmutex_->lock_shared(); - } - } - virtual bool try_lock_shared() { - bool gotlock{false}; - if (!g_multi_instance) { - gotlock = mutex_.try_lock_shared(); - } else { - gotlock = dmutex_->try_lock_shared(); - } - if (gotlock) { - ref_count_.fetch_add(1u); - } - return gotlock; - } - virtual void unlock_shared() { - if (!g_multi_instance) { - mutex_.unlock_shared(); - } else { - dmutex_->unlock_shared(); - } - ref_count_.fetch_sub(1u); - } + virtual void lock_shared(); + virtual bool try_lock_shared(); + virtual void unlock_shared(); virtual bool isAcquired() const { return ref_count_.load() > 0; } @@ -188,196 +136,42 @@ class TableLockMgrImpl { std::is_same_v); public: - static T& instance() { - static T mgr; - return mgr; - } - virtual ~TableLockMgrImpl() = default; - - virtual MutexTracker* getTableMutex(const ChunkKey table_key) { - std::lock_guard access_map_lock(map_mutex_); - auto mutex_it = table_mutex_map_.find(table_key); - if (mutex_it != table_mutex_map_.end()) { - return mutex_it->second.get(); - } - - // NOTE(sy): Only used by --multi-instance clusters. 
- std::unique_ptr dmutex = - getClusterTableMutex(table_key); + static T& instance(); - return table_mutex_map_ - .emplace(table_key, std::make_unique(std::move(dmutex))) - .first->second.get(); - } + virtual ~TableLockMgrImpl() = default; - std::set getLockedTables() const { - std::set ret; - std::lock_guard access_map_lock(map_mutex_); - for (const auto& kv : table_mutex_map_) { - if (kv.second->isAcquired()) { - ret.insert(kv.first); - } - } + virtual MutexTracker* getTableMutex(const ChunkKey& table_key); - return ret; - } + std::set getLockedTables() const; static WriteLock getWriteLockForTable(const Catalog_Namespace::Catalog& cat, - const std::string& table_name) { - auto lock = WriteLock(getMutexTracker(cat, table_name)); - // Ensure table still exists after lock is acquired. - validateExistingTable(cat, table_name); - return lock; - } + const std::string& table_name); - static WriteLock getWriteLockForTable(const ChunkKey table_key) { - auto& table_lock_mgr = T::instance(); - return WriteLock(table_lock_mgr.getTableMutex(table_key)); - } + static WriteLock getWriteLockForTable(const ChunkKey& table_key); static ReadLock getReadLockForTable(Catalog_Namespace::Catalog& cat, - const std::string& table_name) { - auto lock = ReadLock(getMutexTracker(cat, table_name)); - // Ensure table still exists after lock is acquired. - validateAndGetExistingTableId(cat, table_name); - return lock; - } + const std::string& table_name); - static ReadLock getReadLockForTable(const ChunkKey table_key) { - auto& table_lock_mgr = T::instance(); - return ReadLock(table_lock_mgr.getTableMutex(table_key)); - } + static ReadLock getReadLockForTable(const ChunkKey& table_key); protected: TableLockMgrImpl() {} virtual std::unique_ptr getClusterTableMutex( - const ChunkKey table_key) { - std::unique_ptr table_mutex; - - std::string table_key_as_text; - for (auto n : table_key) { - table_key_as_text += (!table_key_as_text.empty() ? 
"_" : "") + std::to_string(n); - } - - // A callback used for syncing with most of the changed Catalog metadata, in-general, - // such as the list of tables that exist, dashboards, etc. This is accomplished by - // read locking, and immediately unlocking, dcatalogMutex_, so - // cat->reloadCatalogMetadataUnlocked() will be called. - auto cb_reload_catalog_metadata = [table_key](bool write) { - if constexpr (T::kind == "insert") { - CHECK(write); // The insert lock is for writing, never for reading. - } - auto cat = Catalog_Namespace::SysCatalog::instance().getCatalog( - table_key[CHUNK_KEY_DB_IDX]); - CHECK(cat); - heavyai::shared_lock dread_lock( - *cat->dcatalogMutex_); - }; - - if constexpr (T::kind == "schema") { - // A callback used for reloading the Catalog schema for the one table being locked. - auto cb_reload_table_metadata = [table_key, table_key_as_text](size_t version) { - VLOG(2) << "reloading table metadata for: table_" << table_key_as_text; - CHECK_EQ(table_key.size(), 2U); - auto cat = Catalog_Namespace::SysCatalog::instance().getCatalog( - table_key[CHUNK_KEY_DB_IDX]); - CHECK(cat); - heavyai::shared_lock dread_lock( - *cat->dcatalogMutex_); - cat->reloadTableMetadataUnlocked(table_key[CHUNK_KEY_TABLE_IDX]); - }; - - // Create the table mutex. - heavyai::DistributedSharedMutex::Callbacks cbs{ - /*pre_lock_callback=*/cb_reload_catalog_metadata, - /*reload_cache_callback=*/cb_reload_table_metadata}; - auto schema_lockfile{ - std::filesystem::path(g_base_path) / shared::kLockfilesDirectoryName / - shared::kCatalogDirectoryName / - ("table_" + table_key_as_text + "." + T::kind.data() + ".lockfile")}; - table_mutex = std::make_unique( - schema_lockfile.string(), cbs); - } else if constexpr (T::kind == "data" || T::kind == "insert") { - // A callback used for reloading the DataMgr data for the one table being locked. 
- auto cb_reload_table_data = [table_key, table_key_as_text](size_t version) { - VLOG(2) << "invalidating table caches for new version " << version - << " of: table_" << table_key_as_text; - CHECK_EQ(table_key.size(), 2U); - auto cat = Catalog_Namespace::SysCatalog::instance().getCatalog( - table_key[CHUNK_KEY_DB_IDX]); - CHECK(cat); - cat->invalidateCachesForTable(table_key[CHUNK_KEY_TABLE_IDX]); - }; - - // Create the rows mutex. - auto rows_lockfile{std::filesystem::path(g_base_path) / - shared::kLockfilesDirectoryName / shared::kDataDirectoryName / - ("table_" + table_key_as_text + ".rows.lockfile")}; - std::shared_ptr rows_mutex = - std::make_shared( - rows_lockfile.string(), - /*reload_cache_callback=*/cb_reload_table_data); - - // A callback used for syncing with outside changes to this table's row data. - auto cb_reload_row_data = [table_key, rows_mutex](bool /*write*/) { - heavyai::shared_lock rows_read_lock(*rows_mutex); - }; - - // A callback to notify other nodes about our changes to this table's row data. - auto cb_notify_about_row_data = [table_key, rows_mutex](bool write) { - if (write) { - heavyai::unique_lock rows_write_lock( - *rows_mutex); - } - }; - - // Create the table mutex. - heavyai::DistributedSharedMutex::Callbacks cbs{ - /*pre_lock_callback=*/cb_reload_catalog_metadata, - {}, - /*post_lock_callback=*/cb_reload_row_data, - /*pre_unlock_callback=*/cb_notify_about_row_data}; - auto table_lockfile{ - std::filesystem::path(g_base_path) / shared::kLockfilesDirectoryName / - shared::kDataDirectoryName / - ("table_" + table_key_as_text + "." 
+ T::kind.data() + ".lockfile")}; - table_mutex = - std::make_unique(table_lockfile.string(), cbs); - } else { - UNREACHABLE() << "unexpected lockmgr kind: " << T::kind; - } - - return table_mutex; - } + const ChunkKey& table_key) const; mutable std::mutex map_mutex_; std::map> table_mutex_map_; private: static MutexTracker* getMutexTracker(const Catalog_Namespace::Catalog& catalog, - const std::string& table_name) { - ChunkKey chunk_key{catalog.getDatabaseId(), - validateAndGetExistingTableId(catalog, table_name)}; - auto& table_lock_mgr = T::instance(); - MutexTracker* tracker = table_lock_mgr.getTableMutex(chunk_key); - CHECK(tracker); - return tracker; - } + const std::string& table_name); static void validateExistingTable(const Catalog_Namespace::Catalog& catalog, - const std::string& table_name) { - validateAndGetExistingTableId(catalog, table_name); - } + const std::string& table_name); static int32_t validateAndGetExistingTableId(const Catalog_Namespace::Catalog& catalog, - const std::string& table_name) { - auto table_id = catalog.getTableId(table_name); - if (!table_id.has_value()) { - throw Catalog_Namespace::TableNotFoundException(table_name, catalog.name()); - } - return table_id.value(); - } + const std::string& table_name); }; template diff --git a/Logger/Logger.cpp b/Logger/Logger.cpp index 36db997d8b..045320ece8 100644 --- a/Logger/Logger.cpp +++ b/Logger/Logger.cpp @@ -668,9 +668,11 @@ Duration* newDuration(DebugTimerParams const debug_timer_params) { return duration_tree_ptr->newDuration(debug_timer_params); } LOG(ERROR) << "DEBUG_TIMER(" << debug_timer_params.name_ - << ") must not be called from the root thread(0) at " + << ") is being called on a child thread nested within an existing " + "DEBUG_TIMER call at " << debug_timer_params.file_ << ':' << debug_timer_params.line_ - << ". New threads require DEBUG_TIMER_NEW_THREAD() to be called first."; + << ". 
New threads require DEBUG_TIMER_NEW_THREAD() to be called first to " + "guarantee accurate reporting."; } return nullptr; // Inactive - don't measure or report timing. } diff --git a/Parser/ParserNode.cpp b/Parser/ParserNode.cpp index 0865ee492a..7b16e8af45 100644 --- a/Parser/ParserNode.cpp +++ b/Parser/ParserNode.cpp @@ -46,6 +46,7 @@ #include "Catalog/Catalog.h" #include "Catalog/DataframeTableDescriptor.h" #include "Catalog/SharedDictionaryValidator.h" +#include "DataMgr/FileMgr/FileBuffer.h" #include "Fragmenter/InsertOrderFragmenter.h" #include "Fragmenter/SortedOrderFragmenter.h" #include "Fragmenter/TargetValueConvertersFactories.h" @@ -1560,6 +1561,15 @@ void parse_copy_params(const std::list>& option if (bool_from_string_literal(str_literal)) { copy_params.raster_point_compute_angle = true; } + } else if (boost::iequals(*p->get_name(), "raster_drop_if_all_null")) { + const StringLiteral* str_literal = + dynamic_cast(p->get_value()); + if (str_literal == nullptr) { + throw std::runtime_error("'raster_drop_if_all_null' option must be a boolean."); + } + if (bool_from_string_literal(str_literal)) { + copy_params.raster_drop_if_all_null = true; + } } else if (boost::iequals(*p->get_name(), "sql_order_by")) { if (auto str_literal = dynamic_cast(p->get_value())) { copy_params.sql_order_by = *str_literal->get_stringval(); @@ -2933,8 +2943,14 @@ decltype(auto) get_header_def(DataframeTableDescriptor& df_td, decltype(auto) get_page_size_def(TableDescriptor& td, const NameValueAssign* p, const std::list& columns) { - return get_property_value(p, - [&td](const auto val) { td.fragPageSize = val; }); + return get_property_value(p, [&td](const auto val) { + const auto min_page_size = File_Namespace::FileBuffer::getMinPageSize(); + if (val < min_page_size) { + throw std::runtime_error("page_size cannot be less than " + + std::to_string(min_page_size)); + } + td.fragPageSize = val; + }); } decltype(auto) get_max_rows_def(TableDescriptor& td, const NameValueAssign* p, @@ 
-3908,6 +3924,7 @@ std::shared_ptr getResultSet(QueryStateProxy query_state_proxy, g_running_query_interrupt_freq, g_pending_query_interrupt_freq, g_optimize_cuda_block_and_grid_sizes, + g_from_table_reordering, false, std::numeric_limits::max(), ExecutorType::Native, @@ -4746,6 +4763,10 @@ void CreateTableAsSelectStmt::execute(const Catalog_Namespace::SessionInfo& sess std::to_string(sql_constants::kMaxNumericPrecision) + "."); } + // flatbuffer storage in real tables is not implemented, so we + // reset the flatbuffer storage flag for CTAS cases that select + // from UDTF columns which may use flatbuffer storage: + cd.columnType.setUsesFlatBuffer(false); } TableDescriptor td; diff --git a/Parser/ParserNode.h b/Parser/ParserNode.h index ad1434153f..f5889f75b6 100644 --- a/Parser/ParserNode.h +++ b/Parser/ParserNode.h @@ -2240,11 +2240,24 @@ struct PositiveOrZeroValidate { } }; +namespace { +template +const std::string* validate_and_get_str(T name_value_assign) { + auto str = dynamic_cast(name_value_assign->get_value()); + if (!str) { + auto option_name = name_value_assign->get_name(); + CHECK(option_name); + throw std::runtime_error("The \"" + *option_name + "\" option must be a string."); + } + return str->get_stringval(); +} +} // namespace + template <> struct DefaultValidate { template decltype(auto) operator()(T t) { - const auto val = static_cast(t->get_value())->get_stringval(); + const auto val = validate_and_get_str(t); CHECK(val); const auto val_upper = boost::to_upper_copy(*val); return val_upper; @@ -2254,7 +2267,7 @@ struct DefaultValidate { struct CaseSensitiveValidate { template decltype(auto) operator()(T t) { - const auto val = static_cast(t->get_value())->get_stringval(); + const auto val = validate_and_get_str(t); CHECK(val); return *val; } diff --git a/Parser/ReservedKeywords.h b/Parser/ReservedKeywords.h index c2dd0eaf37..ce7b1bfe3c 100644 --- a/Parser/ReservedKeywords.h +++ b/Parser/ReservedKeywords.h @@ -209,6 +209,7 @@ static std::set 
reserved_keywords{ "GROUP", "GROUPING", "GROUPS", + "HASH", "HAVING", "HOLD", "HOUR", @@ -327,6 +328,7 @@ static std::set reserved_keywords{ "REF", "REFERENCES", "REFERENCING", + "REGEXP_COUNT", "REGEXP_REPLACE", "REGEXP_SUBSTR", "REGEXP_MATCH" diff --git a/QueryEngine/ArithmeticIR.cpp b/QueryEngine/ArithmeticIR.cpp index 081d942a4a..36e158c6c6 100644 --- a/QueryEngine/ArithmeticIR.cpp +++ b/QueryEngine/ArithmeticIR.cpp @@ -21,6 +21,8 @@ // Code generation routines and helpers for basic arithmetic and unary minus. +using heavyai::ErrorCode; + namespace { std::string numeric_or_time_interval_type_name(const SQLTypeInfo& ti1, @@ -270,7 +272,7 @@ llvm::Value* CodeGenerator::codegenAdd(const Analyzer::BinOper* bin_oper, if (need_overflow_check) { cgen_state_->ir_builder_.SetInsertPoint(add_fail); cgen_state_->ir_builder_.CreateRet( - cgen_state_->llInt(Executor::ERR_OVERFLOW_OR_UNDERFLOW)); + cgen_state_->llInt(int32_t(ErrorCode::OVERFLOW_OR_UNDERFLOW))); cgen_state_->ir_builder_.SetInsertPoint(add_ok); } return ret; @@ -334,7 +336,7 @@ llvm::Value* CodeGenerator::codegenSub(const Analyzer::BinOper* bin_oper, if (need_overflow_check) { cgen_state_->ir_builder_.SetInsertPoint(sub_fail); cgen_state_->ir_builder_.CreateRet( - cgen_state_->llInt(Executor::ERR_OVERFLOW_OR_UNDERFLOW)); + cgen_state_->llInt(int32_t(ErrorCode::OVERFLOW_OR_UNDERFLOW))); cgen_state_->ir_builder_.SetInsertPoint(sub_ok); } return ret; @@ -422,7 +424,7 @@ llvm::Value* CodeGenerator::codegenMul(const Analyzer::BinOper* bin_oper, if (need_overflow_check) { cgen_state_->ir_builder_.SetInsertPoint(mul_fail); cgen_state_->ir_builder_.CreateRet( - cgen_state_->llInt(Executor::ERR_OVERFLOW_OR_UNDERFLOW)); + cgen_state_->llInt(int32_t(ErrorCode::OVERFLOW_OR_UNDERFLOW))); cgen_state_->ir_builder_.SetInsertPoint(mul_ok); } return ret; @@ -473,7 +475,7 @@ llvm::Value* CodeGenerator::codegenDiv(llvm::Value* lhs_lv, cgen_state_->ir_builder_.SetInsertPoint(decimal_div_fail); cgen_state_->ir_builder_.CreateRet( - 
cgen_state_->llInt(Executor::ERR_OVERFLOW_OR_UNDERFLOW)); + cgen_state_->llInt(int32_t(ErrorCode::OVERFLOW_OR_UNDERFLOW))); cgen_state_->ir_builder_.SetInsertPoint(decimal_div_ok); @@ -531,7 +533,7 @@ llvm::Value* CodeGenerator::codegenDiv(llvm::Value* lhs_lv, ti.get_type() == kFLOAT ? cgen_state_->llFp(NULL_FLOAT) : cgen_state_->llFp(NULL_DOUBLE)})); cgen_state_->ir_builder_.SetInsertPoint(div_zero); - cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(Executor::ERR_DIV_BY_ZERO)); + cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(ErrorCode::DIV_BY_ZERO))); cgen_state_->ir_builder_.SetInsertPoint(div_ok); return ret; } @@ -619,7 +621,7 @@ llvm::Value* CodeGenerator::codegenMod(llvm::Value* lhs_lv, "mod_" + null_typename + null_check_suffix, {lhs_lv, rhs_lv, cgen_state_->llInt(inline_int_null_val(ti))}); cgen_state_->ir_builder_.SetInsertPoint(mod_zero); - cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(Executor::ERR_DIV_BY_ZERO)); + cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(ErrorCode::DIV_BY_ZERO))); cgen_state_->ir_builder_.SetInsertPoint(mod_ok); return ret; } @@ -696,7 +698,7 @@ llvm::Value* CodeGenerator::codegenUMinus(const Analyzer::UOper* uoper, if (need_overflow_check) { cgen_state_->ir_builder_.SetInsertPoint(uminus_fail); cgen_state_->ir_builder_.CreateRet( - cgen_state_->llInt(Executor::ERR_OVERFLOW_OR_UNDERFLOW)); + cgen_state_->llInt(int32_t(ErrorCode::OVERFLOW_OR_UNDERFLOW))); cgen_state_->ir_builder_.SetInsertPoint(uminus_ok); } return ret; @@ -757,7 +759,7 @@ llvm::Value* CodeGenerator::codegenBinOpWithOverflowForCPU( cgen_state_->ir_builder_.CreateCondBr(overflow, check_fail, check_ok); cgen_state_->ir_builder_.SetInsertPoint(check_fail); cgen_state_->ir_builder_.CreateRet( - cgen_state_->llInt(Executor::ERR_OVERFLOW_OR_UNDERFLOW)); + cgen_state_->llInt(int32_t(ErrorCode::OVERFLOW_OR_UNDERFLOW))); cgen_state_->ir_builder_.SetInsertPoint(check_ok); diff --git a/QueryEngine/ArrayOps.cpp 
b/QueryEngine/ArrayOps.cpp index 1b717521fd..c5b8ebc11c 100644 --- a/QueryEngine/ArrayOps.cpp +++ b/QueryEngine/ArrayOps.cpp @@ -79,6 +79,32 @@ extern "C" DEVICE RUNTIME_EXPORT bool point_coord_array_is_null(int8_t* chunk_it return ad.is_null; } +extern "C" DEVICE RUNTIME_EXPORT int32_t +point_coord_array_size(int8_t* chunk_iter_, + const uint64_t row_pos, + const uint32_t elem_log_sz) { + if (!chunk_iter_) { + return 0; + } + ChunkIter* chunk_iter = reinterpret_cast(chunk_iter_); + ArrayDatum ad; + bool is_end; + ChunkIter_get_nth_point_coords(chunk_iter, row_pos, &ad, &is_end); + return ad.is_null ? 0 : ad.length >> elem_log_sz; +} + +extern "C" DEVICE RUNTIME_EXPORT int32_t +point_coord_array_size_nullable(int8_t* chunk_iter_, + const uint64_t row_pos, + const uint32_t elem_log_sz, + const int32_t null_val) { + ChunkIter* chunk_iter = reinterpret_cast(chunk_iter_); + ArrayDatum ad; + bool is_end; + ChunkIter_get_nth_point_coords(chunk_iter, row_pos, &ad, &is_end); + return ad.is_null ? 
null_val : ad.length >> elem_log_sz; +} + #define ARRAY_AT(type) \ extern "C" DEVICE RUNTIME_EXPORT type array_at_##type( \ int8_t* chunk_iter_, const uint64_t row_pos, const uint32_t elem_idx) { \ diff --git a/QueryEngine/CMakeLists.txt b/QueryEngine/CMakeLists.txt index 011d0cf7ea..e3316ccd9c 100644 --- a/QueryEngine/CMakeLists.txt +++ b/QueryEngine/CMakeLists.txt @@ -427,10 +427,6 @@ set(QUERY_ENGINE_LIBS list(APPEND QUERY_ENGINE_LIBS ${llvm_libs} ${ZLIB_LIBRARIES}) -if(ENABLE_OMNIVERSE_CONNECTOR) - list(APPEND QUERY_ENGINE_LIBS poly2tri) -endif() - target_link_libraries(QueryEngine ${QUERY_ENGINE_LIBS}) add_custom_command( diff --git a/QueryEngine/CalciteAdapter.cpp b/QueryEngine/CalciteAdapter.cpp index 9e30d75ffb..5f5f6267dd 100644 --- a/QueryEngine/CalciteAdapter.cpp +++ b/QueryEngine/CalciteAdapter.cpp @@ -17,25 +17,30 @@ #include "CalciteAdapter.h" #include -#include #include "Logger/Logger.h" #include "Shared/StringTransform.h" #include "Shared/clean_boost_regex.hpp" -namespace { - -std::string pg_shim_impl(const std::string& query) { - auto result = query; - { +std::string pg_shim(std::string const& query) { + std::string result = query; + try { static const auto& unnest_expr = *new boost::regex( R"((\s+|,)(unnest)\s*\()", boost::regex::extended | boost::regex::icase); static_assert(std::is_trivially_destructible_v); apply_shim(result, unnest_expr, [](std::string& result, const boost::smatch& what) { result.replace(what.position(), what.length(), what[1] + "PG_UNNEST("); }); + } catch (const std::exception& e) { + // boost::regex throws an exception about the complexity of matching when + // the wrong type of quotes are used or they're mismatched. 
Let the query + // through unmodified + // this can be applied for all catch statements defined below + LOG(WARNING) << "Detect error while parsing PG_UNNEST: " << e.what(); + return query; } - { + + try { static const auto& cast_true_expr = *new boost::regex(R"(CAST\s*\(\s*'t'\s+AS\s+boolean\s*\))", boost::regex::extended | boost::regex::icase); @@ -44,8 +49,12 @@ std::string pg_shim_impl(const std::string& query) { result, cast_true_expr, [](std::string& result, const boost::smatch& what) { result.replace(what.position(), what.length(), "true"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing CAST AS BOOLEAN(TRUE): " << e.what(); + return query; } - { + + try { static const auto& cast_false_expr = *new boost::regex(R"(CAST\s*\(\s*'f'\s+AS\s+boolean\s*\))", boost::regex::extended | boost::regex::icase); @@ -54,8 +63,12 @@ std::string pg_shim_impl(const std::string& query) { result, cast_false_expr, [](std::string& result, const boost::smatch& what) { result.replace(what.position(), what.length(), "false"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing CAST AS BOOLEAN(FALSE): " << e.what(); + return query; } - { + + try { static const auto& ilike_expr = *new boost::regex( R"((\s+|\()((?!\()[^\s]+)\s+(not\s)?\s*ilike\s+('(?:[^']+|'')+')(\s+escape(\s+('[^']+')))?)", boost::regex::perl | boost::regex::icase); @@ -67,8 +80,12 @@ std::string pg_shim_impl(const std::string& query) { what[1] + what[3] + "PG_ILIKE(" + what[2] + ", " + what[4] + (esc.empty() ? 
"" : ", " + esc) + ")"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing PG_ILIKE: " << e.what(); + return query; } - { + + try { static const auto& regexp_expr = *new boost::regex( R"((\s+)([^\s]+)\s+REGEXP\s+('(?:[^']+|'')+')(\s+escape(\s+('[^']+')))?)", boost::regex::perl | boost::regex::icase); @@ -80,8 +97,12 @@ std::string pg_shim_impl(const std::string& query) { what[1] + "REGEXP_LIKE(" + what[2] + ", " + what[3] + (esc.empty() ? "" : ", " + esc) + ")"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing REGEXP_LIKE: " << e.what(); + return query; } - { + + try { // Comparison operator needed to distinguish from other uses of ALL (e.g. UNION ALL) static const auto& quant_expr = *new boost::regex(R"(([<=>]\s*)(any|all)\s+([^(\s|;)]+))", @@ -92,8 +113,12 @@ std::string pg_shim_impl(const std::string& query) { result.replace( what.position(), what.length(), what[1] + quant_fname + what[3] + ')'); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing PG_ANY|PG_ALL: " << e.what(); + return query; } - { + + try { static const auto& immediate_cast_expr = *new boost::regex(R"(TIMESTAMP\(([0369])\)\s+('[^']+'))", boost::regex::extended | boost::regex::icase); @@ -104,8 +129,12 @@ std::string pg_shim_impl(const std::string& query) { what.length(), "CAST(" + what[2] + " AS TIMESTAMP(" + what[1] + "))"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing CAST AS TIMESTAMP: " << e.what(); + return query; } - { + + try { static const auto& timestampadd_expr = *new boost::regex(R"(DATE(ADD|DIFF|PART|_TRUNC)\s*\(\s*(\w+)\s*,)", boost::regex::extended | boost::regex::icase); @@ -115,9 +144,12 @@ std::string pg_shim_impl(const std::string& query) { result.replace( what.position(), what.length(), "DATE" + what[1] + "('" + what[2] + "',"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing 
DATE(ADD|DIFF|PART|_TRUNC): " << e.what(); + return query; } - { + try { static const auto& pg_extract_expr = *new boost::regex( R"(PG_EXTRACT\s*\(\s*(\w+)\s*,)", boost::regex::extended | boost::regex::icase); static_assert(std::is_trivially_destructible_v); @@ -146,9 +178,12 @@ std::string pg_shim_impl(const std::string& query) { what.length(), "PG_EXTRACT('" + what[1] + "', " + what[2] + ")"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing PG_EXTRACT: " << e.what(); + return query; } - { + try { static const auto& date_trunc_expr = *new boost::regex( R"(([^_])date_trunc\s*)", boost::regex::extended | boost::regex::icase); static_assert(std::is_trivially_destructible_v); @@ -156,8 +191,11 @@ std::string pg_shim_impl(const std::string& query) { result, date_trunc_expr, [](std::string& result, const boost::smatch& what) { result.replace(what.position(), what.length(), what[1] + "PG_DATE_TRUNC"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing PG_DATE_TRUNC: " << e.what(); + return query; } - { + try { static const auto& timestampadd_expr_quoted = *new boost::regex(R"(TIMESTAMP(ADD|DIFF)\s*\(\s*'(\w+)'\s*,)", boost::regex::extended | boost::regex::icase); @@ -178,8 +216,11 @@ std::string pg_shim_impl(const std::string& query) { result.replace( what.position(), what.length(), "DATE" + what[1] + "('" + what[2] + "',"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing TIMESTAMP(ADD|DIFF): " << e.what(); + return query; } - { + try { static const auto& us_timestamp_cast_expr = *new boost::regex(R"(CAST\s*\(\s*('[^']+')\s*AS\s*TIMESTAMP\(6\)\s*\))", boost::regex::extended | boost::regex::icase); @@ -190,8 +231,11 @@ std::string pg_shim_impl(const std::string& query) { result.replace( what.position(), what.length(), "usTIMESTAMP(" + what[1] + ")"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing usTIMESTAMP: " << 
e.what(); + return query; } - { + try { static const auto& ns_timestamp_cast_expr = *new boost::regex(R"(CAST\s*\(\s*('[^']+')\s*AS\s*TIMESTAMP\(9\)\s*\))", boost::regex::extended | boost::regex::icase); @@ -202,36 +246,39 @@ std::string pg_shim_impl(const std::string& query) { result.replace( what.position(), what.length(), "nsTIMESTAMP(" + what[1] + ")"); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing nsTIMESTAMP: " << e.what(); + return query; } - { + try { static const auto& corr_expr = *new boost::regex( R"((\s+|,|\()(corr)\s*\()", boost::regex::extended | boost::regex::icase); static_assert(std::is_trivially_destructible_v); apply_shim(result, corr_expr, [](std::string& result, const boost::smatch& what) { result.replace(what.position(), what.length(), what[1] + "CORRELATION("); }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing CORRELATION: " << e.what(); + return query; } - { - try { - // the geography regex pattern is expensive and can sometimes run out of stack space - // on long queries. Treat it separately from the other shims. - static const auto& cast_to_geography_expr = - *new boost::regex(R"(CAST\s*\(\s*(((?!geography).)+)\s+AS\s+geography\s*\))", - boost::regex::perl | boost::regex::icase); - static_assert(std::is_trivially_destructible_v); - apply_shim(result, - cast_to_geography_expr, - [](std::string& result, const boost::smatch& what) { - result.replace(what.position(), - what.length(), - "CastToGeography(" + what[1] + ")"); - }); - } catch (const std::exception& e) { - LOG(WARNING) << "Error apply geography cast shim: " << e.what() - << "\nContinuing query parse..."; - } + try { + // the geography regex pattern is expensive and can sometimes run out of stack + // space on long queries. Treat it separately from the other shims. 
+ static const auto& cast_to_geography_expr = + *new boost::regex(R"(CAST\s*\(\s*(((?!geography).)+)\s+AS\s+geography\s*\))", + boost::regex::perl | boost::regex::icase); + static_assert(std::is_trivially_destructible_v); + apply_shim(result, + cast_to_geography_expr, + [](std::string& result, const boost::smatch& what) { + result.replace( + what.position(), what.length(), "CastToGeography(" + what[1] + ")"); + }); + } catch (const std::exception& e) { + LOG(WARNING) << "Detect error while parsing CastToGeography: " << e.what(); + return query; } - { + try { static const auto& interval_subsecond_expr = *new boost::regex(R"(interval\s+([0-9]+)\s+(millisecond|microsecond|nanosecond))", boost::regex::extended | boost::regex::icase); @@ -263,21 +310,9 @@ std::string pg_shim_impl(const std::string& query) { what.position(), what.length(), "interval " + interval_str + " second"); } }); - } - - return result; -} - -} // namespace - -std::string pg_shim(const std::string& query) { - try { - return pg_shim_impl(query); } catch (const std::exception& e) { - LOG(WARNING) << "Error applying shim: " << e.what() << "\nContinuing query parse..."; - // boost::regex throws an exception about the complexity of matching when - // the wrong type of quotes are used or they're mismatched. Let the query - // through unmodified, the parser will throw a much more informative error. 
+ LOG(WARNING) << "Detect error while parsing INTERVAL: " << e.what(); + return query; } - return query; + return result; } diff --git a/QueryEngine/CalciteAdapter.h b/QueryEngine/CalciteAdapter.h index bec3680b64..53cdea6830 100644 --- a/QueryEngine/CalciteAdapter.h +++ b/QueryEngine/CalciteAdapter.h @@ -18,4 +18,4 @@ #include -std::string pg_shim(const std::string&); +std::string pg_shim(std::string const&); diff --git a/QueryEngine/CardinalityEstimator.cpp b/QueryEngine/CardinalityEstimator.cpp index 75eb3b4bf8..436af26dcc 100644 --- a/QueryEngine/CardinalityEstimator.cpp +++ b/QueryEngine/CardinalityEstimator.cpp @@ -75,10 +75,10 @@ size_t RelAlgExecutor::getNDVEstimation(const WorkUnit& work_unit, } return estimator_result->getNDVEstimator(); } catch (const QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_OUT_OF_TIME) { + if (e.hasErrorCode(ErrorCode::OUT_OF_TIME)) { throw std::runtime_error("Cardinality estimation query ran out of time"); } - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { throw std::runtime_error("Cardinality estimation query has been interrupted"); } throw std::runtime_error("Failed to run the cardinality estimation query: " + diff --git a/QueryEngine/CastIR.cpp b/QueryEngine/CastIR.cpp index e5a77b2335..3964f493cc 100644 --- a/QueryEngine/CastIR.cpp +++ b/QueryEngine/CastIR.cpp @@ -550,7 +550,7 @@ void CodeGenerator::codegenCastBetweenIntTypesOverflowChecks( cgen_state_->ir_builder_.SetInsertPoint(cast_fail); cgen_state_->ir_builder_.CreateRet( - cgen_state_->llInt(Executor::ERR_OVERFLOW_OR_UNDERFLOW)); + cgen_state_->llInt(int32_t(heavyai::ErrorCode::OVERFLOW_OR_UNDERFLOW))); cgen_state_->ir_builder_.SetInsertPoint(cast_ok); } diff --git a/QueryEngine/CodeGenerator.h b/QueryEngine/CodeGenerator.h index 940e639da1..3d25df49df 100644 --- a/QueryEngine/CodeGenerator.h +++ b/QueryEngine/CodeGenerator.h @@ -23,6 +23,7 @@ #include "Shared/DbObjectKeys.h" class 
AbstractMLModel; +class AbstractTreeModel; // Code generation utility to be used for queries and scalar expressions. class CodeGenerator { @@ -233,13 +234,11 @@ class CodeGenerator { llvm::Value* codegen(const Analyzer::PCAProjectExpr*, const CompilationOptions&); llvm::Value* codegenLinRegPredict(const Analyzer::MLPredictExpr*, - const std::string& model_name, const std::shared_ptr& model, const CompilationOptions&); llvm::Value* codegenTreeRegPredict(const Analyzer::MLPredictExpr*, - const std::string& model_name, - const std::shared_ptr& model, + const std::shared_ptr& tree_model, const CompilationOptions&); llvm::Value* codegen(const Analyzer::StringOper*, const CompilationOptions&); diff --git a/QueryEngine/CodegenHelper.cpp b/QueryEngine/CodegenHelper.cpp index 14e7d017df..cb31dd60e2 100644 --- a/QueryEngine/CodegenHelper.cpp +++ b/QueryEngine/CodegenHelper.cpp @@ -72,4 +72,18 @@ std::vector createPtrWithHoistedMemoryAddr( return hoisted_ptrs; } +// todo (yoonmin): support String literal +std::vector hoistLiteral(CodeGenerator* code_generator, + CompilationOptions const& co, + Datum d, + SQLTypeInfo type, + size_t num_devices_to_hoist_literal) { + CHECK(co.hoist_literals); + CHECK(type.is_integer() || type.is_decimal() || type.is_fp() || type.is_boolean()); + auto literal_expr = makeExpr(type, false, d); + std::vector literals(num_devices_to_hoist_literal, + literal_expr.get()); + return code_generator->codegenHoistedConstants(literals, kENCODING_NONE, {}); +} + } // namespace CodegenUtil diff --git a/QueryEngine/CodegenHelper.h b/QueryEngine/CodegenHelper.h index a27376682f..e907ae9d10 100644 --- a/QueryEngine/CodegenHelper.h +++ b/QueryEngine/CodegenHelper.h @@ -36,5 +36,10 @@ std::vector createPtrWithHoistedMemoryAddr( llvm::ConstantInt* ptr, llvm::Type* type, size_t num_devices_to_hoist_literal); +std::vector hoistLiteral(CodeGenerator* code_generator, + CompilationOptions const& co, + Datum d, + SQLTypeInfo type, + size_t num_devices_to_hoist_literal); } 
// namespace CodegenUtil diff --git a/QueryEngine/ColumnFetcher.cpp b/QueryEngine/ColumnFetcher.cpp index 71c2a0de7b..a2ce8d7dd4 100644 --- a/QueryEngine/ColumnFetcher.cpp +++ b/QueryEngine/ColumnFetcher.cpp @@ -62,7 +62,7 @@ std::string getMemoryLevelString(Data_Namespace::MemoryLevel memoryLevel) { } } // namespace -ColumnFetcher::ColumnFetcher(Executor* executor, const ColumnCacheMap& column_cache) +ColumnFetcher::ColumnFetcher(Executor* executor, ColumnCacheMap& column_cache) : executor_(executor), columnarized_table_cache_(column_cache) {} //! Gets a column fragment chunk on CPU or on GPU depending on the effective @@ -130,8 +130,8 @@ std::pair ColumnFetcher::getOneColumnFragment( std::shared_ptr(columnarize_result( executor->row_set_mem_owner_, get_temporary_table(executor->temporary_tables_, table_key.table_id), - executor->executor_id_, thread_idx, + executor->executor_id_, frag_id)))); } col_frag = column_cache[table_key][frag_id].get(); @@ -180,7 +180,7 @@ JoinColumn ColumnFetcher::makeJoinColumn( for (auto& frag : fragments) { if (g_enable_non_kernel_time_query_interrupt && executor->checkNonKernelTimeInterrupted()) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } auto [col_buff, elem_count] = getOneColumnFragment( executor, @@ -308,7 +308,7 @@ const int8_t* ColumnFetcher::getAllTableColumnFragments( for (size_t frag_id = 0; frag_id < frag_count; ++frag_id) { if (g_enable_non_kernel_time_query_interrupt && executor_->checkNonKernelTimeInterrupted()) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } std::list> chunk_holder; std::list chunk_iter_holder; @@ -711,7 +711,7 @@ MergedChunk ColumnFetcher::linearizeVarLenArrayColFrags( chunk_holder_it++, chunk_iter_holder_it++, chunk_num_tuple_it++) { if (g_enable_non_kernel_time_query_interrupt && executor_->checkNonKernelTimeInterrupted()) { - throw 
QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } auto target_chunk = chunk_holder_it->get(); auto target_chunk_data_buffer = target_chunk->getBuffer(); @@ -965,7 +965,7 @@ MergedChunk ColumnFetcher::linearizeFixedLenArrayColFrags( for (; chunk_holder_it != local_chunk_holder.end(); chunk_holder_it++, chunk_iter_holder_it++) { if (g_enable_non_kernel_time_query_interrupt && check_interrupt()) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } auto target_chunk = chunk_holder_it->get(); auto target_chunk_data_buffer = target_chunk->getBuffer(); @@ -1133,8 +1133,8 @@ const int8_t* ColumnFetcher::getResultSetColumn( std::shared_ptr( columnarize_result(executor_->row_set_mem_owner_, buffer, - executor_->executor_id_, thread_idx, + executor_->executor_id_, frag_id)))); } CHECK_NE(size_t(0), columnarized_table_cache_.count(table_key)); diff --git a/QueryEngine/ColumnFetcher.h b/QueryEngine/ColumnFetcher.h index 2eae22bca5..413b3d35ad 100644 --- a/QueryEngine/ColumnFetcher.h +++ b/QueryEngine/ColumnFetcher.h @@ -48,7 +48,7 @@ using MergedChunk = std::pair; class ColumnFetcher { public: - ColumnFetcher(Executor* executor, const ColumnCacheMap& column_cache); + ColumnFetcher(Executor* executor, ColumnCacheMap& column_cache); //! Gets one chunk's pointer and element count on either CPU or GPU. 
static std::pair getOneColumnFragment( @@ -183,7 +183,7 @@ class ColumnFetcher { mutable std::mutex linearization_mutex_; mutable std::mutex chunk_list_mutex_; mutable std::mutex linearized_col_cache_mutex_; - mutable ColumnCacheMap columnarized_table_cache_; + ColumnCacheMap& columnarized_table_cache_; mutable std::unordered_map> columnarized_scan_table_cache_; using DeviceMergedChunkIterMap = std::unordered_map; diff --git a/QueryEngine/ColumnarResults.cpp b/QueryEngine/ColumnarResults.cpp index e4087fbab3..2536f12208 100644 --- a/QueryEngine/ColumnarResults.cpp +++ b/QueryEngine/ColumnarResults.cpp @@ -494,7 +494,7 @@ void ColumnarResults::materializeAllColumnsThroughIteration(const ResultSet& row for (size_t i = start; i < end; ++i, ++local_idx) { if (UNLIKELY((local_idx & 0xFFFF) == 0 && executor_->checkNonKernelTimeInterrupted())) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } do_work(i); } @@ -513,8 +513,8 @@ void ColumnarResults::materializeAllColumnsThroughIteration(const ResultSet& row child.wait(); } } catch (QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { + throw QueryExecutionError(ErrorCode::INTERRUPTED); } throw e; } catch (...) 
{ @@ -543,7 +543,7 @@ void ColumnarResults::materializeAllColumnsThroughIteration(const ResultSet& row while (!done) { if (UNLIKELY((row_idx & 0xFFFF) == 0 && executor_->checkNonKernelTimeInterrupted())) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } do_work(); } @@ -1205,7 +1205,7 @@ void ColumnarResults::materializeAllLazyColumns( for (size_t i = start; i < end; ++i, ++local_idx) { if (UNLIKELY((local_idx & 0xFFFF) == 0 && executor_->checkNonKernelTimeInterrupted())) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } do_work_just_lazy_columns(i, targets_to_skip); } @@ -1224,8 +1224,8 @@ void ColumnarResults::materializeAllLazyColumns( child.wait(); } } catch (QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { + throw QueryExecutionError(ErrorCode::INTERRUPTED); } throw e; } catch (...) { @@ -1304,7 +1304,7 @@ void ColumnarResults::locateAndCountEntries(const ResultSet& rows, entry_idx++, local_idx++) { if (UNLIKELY((local_idx & 0xFFFF) == 0 && executor_->checkNonKernelTimeInterrupted())) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } do_work(total_non_empty, local_idx, entry_idx, thread_idx); } @@ -1330,8 +1330,8 @@ void ColumnarResults::locateAndCountEntries(const ResultSet& rows, child.wait(); } } catch (QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { + throw QueryExecutionError(ErrorCode::INTERRUPTED); } throw e; } catch (...) 
{ @@ -1485,7 +1485,7 @@ void ColumnarResults::compactAndCopyEntriesWithTargetSkipping( entry_idx++, local_idx++) { if (UNLIKELY((local_idx & 0xFFFF) == 0 && executor_->checkNonKernelTimeInterrupted())) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } do_work( non_empty_idx, total_non_empty, local_idx, entry_idx, thread_idx, end_index); @@ -1512,8 +1512,8 @@ void ColumnarResults::compactAndCopyEntriesWithTargetSkipping( child.wait(); } } catch (QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { + throw QueryExecutionError(ErrorCode::INTERRUPTED); } throw e; } catch (...) { @@ -1587,7 +1587,7 @@ void ColumnarResults::compactAndCopyEntriesWithoutTargetSkipping( entry_idx++, local_idx++) { if (UNLIKELY((local_idx & 0xFFFF) == 0 && executor_->checkNonKernelTimeInterrupted())) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } do_work( entry_idx, non_empty_idx, total_non_empty, local_idx, thread_idx, end_index); @@ -1614,8 +1614,8 @@ void ColumnarResults::compactAndCopyEntriesWithoutTargetSkipping( child.wait(); } } catch (QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { + throw QueryExecutionError(ErrorCode::INTERRUPTED); } throw e; } catch (...) 
{ diff --git a/QueryEngine/CompareIR.cpp b/QueryEngine/CompareIR.cpp index 7ae5d8af04..213a3b0058 100644 --- a/QueryEngine/CompareIR.cpp +++ b/QueryEngine/CompareIR.cpp @@ -386,9 +386,7 @@ llvm::Value* CodeGenerator::codegenStrCmp(const SQLOps optype, rhs_ti.get_compression() == kENCODING_DICT) { if (lhs_ti.getStringDictKey() == rhs_ti.getStringDictKey()) { // Both operands share a dictionary - - // check if query is trying to compare a columnt against literal - + // check if query is trying to compare a column against literal auto ir = codegenDictStrCmp(lhs, rhs, optype, co); if (ir) { return ir; @@ -454,6 +452,33 @@ llvm::Value* CodeGenerator::codegenCmpDecimalConst(const SQLOps optype, return codegenCmp(optype, qualifier, {lhs_lv}, new_ti, new_rhs_lit.get(), co); } +namespace { +void unpack_none_encoded_string(CgenState* cgen_state, std::vector& lvs) { + if (lvs.size() != 3) { + CHECK_EQ(size_t(1), lvs.size()); + lvs.push_back(cgen_state->ir_builder_.CreateExtractValue(lvs[0], 0)); + lvs.push_back(cgen_state->ir_builder_.CreateExtractValue(lvs[0], 1)); + lvs.back() = cgen_state->ir_builder_.CreateTrunc( + lvs.back(), llvm::Type::getInt32Ty(cgen_state->context_)); + } + CHECK_EQ(lvs.size(), size_t(3)); +} +void unpack_dict_encoded_string(CgenState* cgen_state, + Executor* executor, + SQLTypeInfo const ti, + llvm::StructType* string_view_struct_type, + std::vector& lvs) { + const auto sdp_ptr = reinterpret_cast(executor->getStringDictionaryProxy( + ti.getStringDictKey(), executor->getRowSetMemoryOwner(), true)); + const auto sv = cgen_state->emitExternalCall( + "string_decompress", string_view_struct_type, {lvs[0], cgen_state->llInt(sdp_ptr)}); + lvs.push_back(cgen_state->ir_builder_.CreateExtractValue(sv, 0)); + lvs.push_back(cgen_state->ir_builder_.CreateExtractValue(sv, 1)); + lvs.back() = cgen_state->ir_builder_.CreateTrunc( + lvs.back(), llvm::Type::getInt32Ty(cgen_state->context_)); +} +} // namespace + llvm::Value* CodeGenerator::codegenCmp(const SQLOps 
optype, const SQLQualifier qualifier, std::vector lhs_lvs, @@ -481,23 +506,31 @@ llvm::Value* CodeGenerator::codegenCmp(const SQLOps optype, lhs_ti.is_boolean() || lhs_ti.is_string() || lhs_ti.is_timeinterval()) { if (lhs_ti.is_string()) { CHECK(rhs_ti.is_string()); + // we sync two string col's encoding scheme + // if one of them is dict-encoded before reaching here, + // i.e., call `codegenCastNonStringToString` or `codegenCastFromString` CHECK_EQ(lhs_ti.get_compression(), rhs_ti.get_compression()); + bool unpack_strings = true; if (lhs_ti.get_compression() == kENCODING_NONE) { - // unpack pointer + length if necessary - if (lhs_lvs.size() != 3) { - CHECK_EQ(size_t(1), lhs_lvs.size()); - lhs_lvs.push_back(cgen_state_->ir_builder_.CreateExtractValue(lhs_lvs[0], 0)); - lhs_lvs.push_back(cgen_state_->ir_builder_.CreateExtractValue(lhs_lvs[0], 1)); - lhs_lvs.back() = cgen_state_->ir_builder_.CreateTrunc( - lhs_lvs.back(), llvm::Type::getInt32Ty(cgen_state_->context_)); - } - if (rhs_lvs.size() != 3) { - CHECK_EQ(size_t(1), rhs_lvs.size()); - rhs_lvs.push_back(cgen_state_->ir_builder_.CreateExtractValue(rhs_lvs[0], 0)); - rhs_lvs.push_back(cgen_state_->ir_builder_.CreateExtractValue(rhs_lvs[0], 1)); - rhs_lvs.back() = cgen_state_->ir_builder_.CreateTrunc( - rhs_lvs.back(), llvm::Type::getInt32Ty(cgen_state_->context_)); + unpack_none_encoded_string(cgen_state_, lhs_lvs); + unpack_none_encoded_string(cgen_state_, rhs_lvs); + } else if (lhs_ti.get_compression() == kENCODING_DICT) { + if (IS_EQUIVALENCE(optype) || optype == kNE) { + // we use `StringDictionaryTranslationMgr` to translate + // dict-encoded lhs against rhs's string dictionary + // and then compare their string ids without unpacking strings + // i.e., call `eq_int32_t_nullable` instead of `string_eq_nullable` + unpack_strings = false; + } else { + auto sv_struct_type_lv = createStringViewStructType(); + unpack_dict_encoded_string( + cgen_state_, executor_, lhs_ti, sv_struct_type_lv, lhs_lvs); + 
unpack_dict_encoded_string( + cgen_state_, executor_, rhs_ti, sv_struct_type_lv, rhs_lvs); } + } + if (unpack_strings) { + // we directly compare two unpacked (i.e., raw) strings std::vector str_cmp_args{ lhs_lvs[1], lhs_lvs[2], rhs_lvs[1], rhs_lvs[2]}; if (!null_check_suffix.empty()) { @@ -507,8 +540,6 @@ llvm::Value* CodeGenerator::codegenCmp(const SQLOps optype, return cgen_state_->emitCall( string_cmp_func(optype) + (null_check_suffix.empty() ? "" : "_nullable"), str_cmp_args); - } else { - CHECK(optype == kEQ || optype == kNE); } } diff --git a/QueryEngine/CompilationOptions.h b/QueryEngine/CompilationOptions.h index 8b30d2bdfa..68d3301f05 100644 --- a/QueryEngine/CompilationOptions.h +++ b/QueryEngine/CompilationOptions.h @@ -20,6 +20,8 @@ #include #include "ExecutorDeviceType.h" +extern bool g_from_table_reordering; + enum class ExecutorOptLevel { Default, ReductionJIT }; enum class ExecutorExplainType { Default, Optimized }; @@ -81,6 +83,7 @@ struct ExecutionOptions { double running_query_interrupt_freq; unsigned pending_query_interrupt_freq; bool optimize_cuda_block_and_grid_sizes; + bool table_reordering{g_from_table_reordering}; bool estimate_output_cardinality{false}; size_t max_join_hash_table_size = std::numeric_limits::max(); ExecutorType executor_type = ExecutorType::Native; diff --git a/QueryEngine/Descriptors/ApproxQuantileDescriptor.h b/QueryEngine/Descriptors/ApproxQuantileDescriptor.h new file mode 100644 index 0000000000..37fbe97564 --- /dev/null +++ b/QueryEngine/Descriptors/ApproxQuantileDescriptor.h @@ -0,0 +1,26 @@ +/* + * Copyright 2023 HEAVY.AI, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +struct ApproxQuantileDescriptor { + size_t buffer_size; // number of elements in TDigest buffer + size_t centroids_size; // number of elements in TDigest centroids +}; + +using ApproxQuantileDescriptors = std::vector; diff --git a/QueryEngine/Descriptors/QueryMemoryDescriptor.cpp b/QueryEngine/Descriptors/QueryMemoryDescriptor.cpp index b58dede072..ac18594e6c 100644 --- a/QueryEngine/Descriptors/QueryMemoryDescriptor.cpp +++ b/QueryEngine/Descriptors/QueryMemoryDescriptor.cpp @@ -250,6 +250,7 @@ std::unique_ptr QueryMemoryDescriptor::init( const size_t shard_count, const size_t max_groups_buffer_entry_count, RenderInfo* render_info, + const ApproxQuantileDescriptors& approx_quantile_descriptors, const CountDistinctDescriptors count_distinct_descriptors, const bool must_use_baseline_sort, const bool output_columnar_hint, @@ -288,6 +289,7 @@ std::unique_ptr QueryMemoryDescriptor::init( /*group_col_compact_width=*/0, std::vector{}, /*entry_count=*/1, + approx_quantile_descriptors, count_distinct_descriptors, false, output_columnar_hint, @@ -428,6 +430,7 @@ std::unique_ptr QueryMemoryDescriptor::init( group_col_compact_width, target_groupby_indices, entry_count, + approx_quantile_descriptors, count_distinct_descriptors, sort_on_gpu_hint, output_columnar, @@ -461,6 +464,7 @@ QueryMemoryDescriptor::QueryMemoryDescriptor( const int8_t group_col_compact_width, const std::vector& target_groupby_indices, const size_t entry_count, + const ApproxQuantileDescriptors& approx_quantile_descriptors, const CountDistinctDescriptors 
count_distinct_descriptors, const bool sort_on_gpu_hint, const bool output_columnar_hint, @@ -482,6 +486,7 @@ QueryMemoryDescriptor::QueryMemoryDescriptor( , max_val_(col_range_info.max) , bucket_(col_range_info.bucket) , has_nulls_(col_range_info.has_nulls) + , approx_quantile_descriptors_(approx_quantile_descriptors) , count_distinct_descriptors_(count_distinct_descriptors) , output_columnar_(false) , render_output_(render_output) diff --git a/QueryEngine/Descriptors/QueryMemoryDescriptor.h b/QueryEngine/Descriptors/QueryMemoryDescriptor.h index b5db1099de..2f07d0c4d0 100644 --- a/QueryEngine/Descriptors/QueryMemoryDescriptor.h +++ b/QueryEngine/Descriptors/QueryMemoryDescriptor.h @@ -23,13 +23,16 @@ #ifndef QUERYENGINE_QUERYMEMORYDESCRIPTOR_H #define QUERYENGINE_QUERYMEMORYDESCRIPTOR_H -#include "../CompilationOptions.h" -#include "../CountDistinct.h" +#include "ApproxQuantileDescriptor.h" #include "ColSlotContext.h" -#include "Types.h" +#include "Logger/Logger.h" +#include "QueryEngine/CompilationOptions.h" +#include "QueryEngine/CountDistinct.h" +#include "QueryEngine/enums.h" +#include "Shared/SqlTypesLayout.h" +#include "Shared/TargetInfo.h" #include -#include "Logger/Logger.h" #include #include @@ -39,9 +42,6 @@ #include #include -#include -#include - extern bool g_cluster; class Executor; @@ -55,6 +55,8 @@ class GroupByAndAggregate; struct ColRangeInfo; struct KeylessInfo; +using heavyai::QueryDescriptionType; + class StreamingTopNOOM : public std::runtime_error { public: StreamingTopNOOM(const size_t heap_size_bytes) @@ -81,7 +83,8 @@ class QueryMemoryDescriptor { const int8_t group_col_compact_width, const std::vector& target_groupby_indices, const size_t entry_count, - const CountDistinctDescriptors count_distinct_descriptors, + const ApproxQuantileDescriptors&, + const CountDistinctDescriptors, const bool sort_on_gpu_hint, const bool output_columnar, const bool render_output, @@ -118,7 +121,8 @@ class QueryMemoryDescriptor { const size_t shard_count, 
const size_t max_groups_buffer_entry_count, RenderInfo* render_info, - const CountDistinctDescriptors count_distinct_descriptors, + const ApproxQuantileDescriptors&, + const CountDistinctDescriptors, const bool must_use_baseline_sort, const bool output_columnar_hint, const bool streaming_top_n_hint, @@ -263,6 +267,10 @@ class QueryMemoryDescriptor { bool hasNulls() const { return has_nulls_; } + const ApproxQuantileDescriptors& getApproxQuantileDescriptors() const { + return approx_quantile_descriptors_; + } + const CountDistinctDescriptor& getCountDistinctDescriptor(const size_t idx) const { CHECK_LT(idx, count_distinct_descriptors_.size()); return count_distinct_descriptors_[idx]; @@ -398,6 +406,7 @@ class QueryMemoryDescriptor { int64_t max_val_; int64_t bucket_; bool has_nulls_; + ApproxQuantileDescriptors approx_quantile_descriptors_; CountDistinctDescriptors count_distinct_descriptors_; bool sort_on_gpu_; bool output_columnar_; diff --git a/QueryEngine/Descriptors/RowSetMemoryOwner.h b/QueryEngine/Descriptors/RowSetMemoryOwner.h index b8f8fd445f..b938d5029e 100644 --- a/QueryEngine/Descriptors/RowSetMemoryOwner.h +++ b/QueryEngine/Descriptors/RowSetMemoryOwner.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -24,8 +25,11 @@ #include #include +#include "ApproxQuantileDescriptor.h" #include "DataMgr/AbstractBuffer.h" #include "DataMgr/Allocators/ArenaAllocator.h" +#include "DataMgr/Allocators/CpuMgrArenaAllocator.h" +#include "DataMgr/Allocators/FastAllocator.h" #include "DataMgr/DataMgr.h" #include "Logger/Logger.h" #include "QueryEngine/AggMode.h" @@ -37,6 +41,9 @@ #include "StringDictionary/StringDictionaryProxy.h" #include "StringOps/StringOps.h" +extern bool g_allow_memory_status_log; +extern bool g_use_cpu_mem_pool_for_output_buffers; + namespace Catalog_Namespace { class Catalog; } @@ -49,28 +56,73 @@ class ResultSet; */ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable { public: - 
RowSetMemoryOwner(const size_t arena_block_size, - const size_t executor_id, - const size_t num_kernel_threads = 0) - : non_owned_group_by_buffers_(num_kernel_threads + 1, nullptr) - , arena_block_size_(arena_block_size) - , executor_id_(executor_id) { - VLOG(2) << "Prepare " << num_kernel_threads + 1 - << " allocators from RowSetMemoryOwner attached to Executor-" << executor_id_; - allocators_.reserve(num_kernel_threads + 1); - for (size_t i = 0; i < num_kernel_threads + 1; i++) { - allocators_.emplace_back(std::make_unique<DramArena>(arena_block_size)); + RowSetMemoryOwner(const size_t arena_block_size, const size_t executor_id) + : arena_block_size_(arena_block_size), executor_id_(executor_id) { + // initialize shared allocator (i.e., allocators_[0]) + if (g_use_cpu_mem_pool_for_output_buffers) { + allocators_.emplace_back(std::make_unique<CpuMgrArenaAllocator>()); + } else { + allocators_.emplace_back(std::make_unique<DramArena>(arena_block_size_)); } - CHECK(!allocators_.empty()); + count_distinct_buffer_allocators_.resize(allocators_.size()); } enum class StringTranslationType { SOURCE_INTERSECTION, SOURCE_UNION }; - int8_t* allocate(const size_t num_bytes, const size_t thread_idx = 0) override { + void setKernelMemoryAllocator(const size_t num_kernels) { + CHECK_GT(num_kernels, static_cast<size_t>(0)); + CHECK_EQ(non_owned_group_by_buffers_.size(), static_cast<size_t>(0)); + // buffer for kernels starts with one-based indexing + auto const required_num_kernels = num_kernels + 1; + non_owned_group_by_buffers_.resize(required_num_kernels, nullptr); + // sometimes the same RSMO instance handles multiple work units or even multiple query + // steps (this means the RSMO's owner, an Executor instance, takes the responsibility to + // process them) so, if the first query step has M allocators but if the second query + // step requires N allocators where N > M, let's allocate N - M allocators instead of + // recreating N new allocators + if (required_num_kernels > allocators_.size()) { + auto const required_num_allocators =
required_num_kernels - allocators_.size(); + VLOG(1) << "Prepare " << required_num_allocators + << " memory allocator(s) (Executor-" << executor_id_ + << ", # existing allocator(s): " << allocators_.size() + << ", # requested allocator(s): " << required_num_kernels << ")"; + for (size_t i = 0; i < required_num_allocators; i++) { + if (g_use_cpu_mem_pool_for_output_buffers) { + allocators_.emplace_back(std::make_unique()); + } else { + // todo (yoonmin): can we determine better default min_block_size per query? + allocators_.emplace_back(std::make_unique(arena_block_size_)); + } + } + } + CHECK_GE(allocators_.size(), required_num_kernels); + count_distinct_buffer_allocators_.resize(allocators_.size()); + } + + // allocate memory via shared allocator + int8_t* allocate(const size_t num_bytes) override { + constexpr size_t thread_idx = 0u; + return allocate(num_bytes, thread_idx); + } + + // allocate memory via thread's unique allocator + int8_t* allocate(const size_t num_bytes, const size_t thread_idx) { CHECK_LT(thread_idx, allocators_.size()); - auto allocator = allocators_[thread_idx].get(); std::lock_guard lock(state_mutex_); - return reinterpret_cast(allocator->allocate(num_bytes)); + return allocateUnlocked(num_bytes, thread_idx); + } + + void initCountDistinctBufferAllocator(size_t buffer_size, size_t thread_idx) { + std::lock_guard lock(state_mutex_); + VLOG(2) << "Count distinct buffer allocator initialized with buffer_size: " + << buffer_size << ", thread_idx: " << thread_idx; + CHECK_LT(thread_idx, count_distinct_buffer_allocators_.size()); + if (count_distinct_buffer_allocators_[thread_idx]) { + VLOG(2) << "Replacing count_distinct_buffer_allocators_[" << thread_idx << "]."; + } + count_distinct_buffer_allocators_[thread_idx] = + std::make_unique( + allocateUnlocked(buffer_size, thread_idx), buffer_size); } std::pair allocateCachedGroupByBuffer(const size_t num_bytes, @@ -83,6 +135,10 @@ class RowSetMemoryOwner final : public SimpleAllocator, 
boost::noncopyable { } // Was not in cache so must allocate auto allocator = allocators_[thread_idx].get(); + if (g_allow_memory_status_log) { + VLOG(1) << "Try to allocate CPU memory: " << num_bytes << " bytes (THREAD-" + << thread_idx << ")"; + } int64_t* group_by_buffer = reinterpret_cast(allocator->allocate(num_bytes)); CHECK(group_by_buffer); // Put in cache @@ -92,7 +148,9 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable { int8_t* allocateCountDistinctBuffer(const size_t num_bytes, const size_t thread_idx = 0) { - int8_t* buffer = allocate(num_bytes, thread_idx); + CHECK_LT(thread_idx, count_distinct_buffer_allocators_.size()); + CHECK(count_distinct_buffer_allocators_[thread_idx]); + int8_t* buffer = count_distinct_buffer_allocators_[thread_idx]->allocate(num_bytes); std::memset(buffer, 0, num_bytes); addCountDistinctBuffer(buffer, num_bytes, /*physical_buffer=*/true); return buffer; @@ -111,9 +169,7 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable { count_distinct_sets_.push_back(count_distinct_set); } - void clearNonOwnedGroupByBuffers() { - non_owned_group_by_buffers_.assign(non_owned_group_by_buffers_.size(), nullptr); - } + void clearNonOwnedGroupByBuffers() { non_owned_group_by_buffers_.clear(); } void addVarlenBuffer(void* varlen_buffer) { std::lock_guard lock(state_mutex_); @@ -303,7 +359,8 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable { ++allocator_id; } oss << "}"; - VLOG(2) << oss.str(); + allocators_.clear(); + VLOG(1) << oss.str(); for (auto count_distinct_set : count_distinct_sets_) { delete count_distinct_set; } @@ -320,8 +377,7 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable { } std::shared_ptr cloneStrDictDataOnly() { - auto rtn = std::make_shared( - arena_block_size_, executor_id_, /*num_kernels=*/1); + auto rtn = std::make_shared(arena_block_size_, executor_id_); rtn->str_dict_proxy_owned_ = str_dict_proxy_owned_; 
rtn->lit_str_dict_proxy_ = lit_str_dict_proxy_; return rtn; @@ -335,7 +391,8 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable { return string_dictionary_generations_; } - quantile::TDigest* nullTDigest(double const q); + quantile::TDigest* initTDigest(size_t thread_idx, ApproxQuantileDescriptor, double q); + void reserveTDigestMemory(size_t thread_idx, size_t capacity); // // key/value store for table function intercommunication @@ -372,6 +429,15 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable { } private: + int8_t* allocateUnlocked(const size_t num_bytes, const size_t thread_idx) { + if (g_allow_memory_status_log) { + VLOG(1) << "Try to allocate CPU memory: " << num_bytes << " bytes (THREAD-" + << thread_idx << ")"; + } + auto allocator = allocators_[thread_idx].get(); + return reinterpret_cast(allocator->allocate(num_bytes)); + } + struct CountDistinctBitmapBuffer { int8_t* ptr; const size_t size; @@ -396,13 +462,22 @@ class RowSetMemoryOwner final : public SimpleAllocator, boost::noncopyable { StringDictionaryGenerations string_dictionary_generations_; std::vector col_buffers_; std::vector varlen_input_buffers_; + + using TDigestAllocator = FastAllocator; + std::deque t_digest_allocators_; std::vector> t_digests_; + std::map> string_ops_owned_; std::list mode_maps_; size_t arena_block_size_; // for cloning std::vector> allocators_; + + using CountDistinctBufferAllocator = FastAllocator; + std::vector> + count_distinct_buffer_allocators_; + size_t executor_id_; mutable std::mutex state_mutex_; diff --git a/QueryEngine/Descriptors/Types.h b/QueryEngine/Descriptors/Types.h deleted file mode 100644 index 1c18dc5f21..0000000000 --- a/QueryEngine/Descriptors/Types.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2022 HEAVY.AI, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file Types.h - * @brief Catch-all for publicly accessible types utilized in various Query Engine - * Descriptors - * - */ - -#pragma once - -#include -#include - -enum class QueryDescriptionType { - GroupByPerfectHash, - GroupByBaselineHash, - Projection, - TableFunction, - NonGroupedAggregate, - Estimator -}; - -inline std::ostream& operator<<(std::ostream& os, const QueryDescriptionType& type) { - switch (type) { - case QueryDescriptionType::GroupByPerfectHash: - os << "GroupByPerfectHash"; - break; - case QueryDescriptionType::GroupByBaselineHash: - os << "GroupByBaselineHash"; - break; - case QueryDescriptionType::Projection: - os << "Projection"; - break; - case QueryDescriptionType::TableFunction: - os << "TableFunction"; - break; - case QueryDescriptionType::NonGroupedAggregate: - os << "NonGroupedAggregate"; - break; - case QueryDescriptionType::Estimator: - os << "Estimator"; - break; - default: - os << "Unknown QueryDescriptionType"; - } - return os; -} - -inline std::string toString(const QueryDescriptionType& type) { - std::ostringstream ss; - ss << type; - return ss.str(); -} diff --git a/QueryEngine/ErrorHandling.h b/QueryEngine/ErrorHandling.h index 2b9dc5b365..421a67c553 100644 --- a/QueryEngine/ErrorHandling.h +++ b/QueryEngine/ErrorHandling.h @@ -16,50 +16,70 @@ #pragma once +#include "enums.h" + #include -#include "Descriptors/Types.h" +using heavyai::ErrorCode; struct QueryExecutionProperties { - QueryDescriptionType query_type; + heavyai::QueryDescriptionType query_type; bool was_multifrag_kernel_launch; }; 
class QueryExecutionError : public std::runtime_error { public: - QueryExecutionError(const int32_t error_code, const std::string& e) - : std::runtime_error("Query execution failed with error code " + - std::to_string(error_code) + "\n" + e) - , error_code_(error_code) {} + QueryExecutionError(const ErrorCode error_code, const std::string& e) + : std::runtime_error(std::string("Query execution failed with error code ") + + to_string(error_code) + "\n" + e) + , error_code_(static_cast(error_code)) {} - QueryExecutionError(const int32_t error_code, + QueryExecutionError(const ErrorCode error_code, const std::string& e, const QueryExecutionProperties& execution_properties) - : std::runtime_error("Query execution failed with error code " + - std::to_string(error_code) + "\n" + e) - , error_code_(error_code) + : std::runtime_error(std::string("Query execution failed with error code ") + + to_string(error_code) + "\n" + e) + , error_code_(static_cast(error_code)) , execution_props_(execution_properties) {} - QueryExecutionError(const int32_t error_code, + QueryExecutionError(const ErrorCode error_code, const QueryExecutionProperties& execution_properties) - : std::runtime_error("Query execution failed with error code " + - std::to_string(error_code)) - , error_code_(error_code) + : std::runtime_error(std::string("Query execution failed with error code ") + + to_string(error_code)) + , error_code_(static_cast(error_code)) , execution_props_(execution_properties) {} + QueryExecutionError(const ErrorCode error_code) + : std::runtime_error(std::string("Query execution failed with error code ") + + to_string(error_code)) + , error_code_(static_cast(error_code)) {} + + // Given error_code may not be in range of enum class ErrorCode. 
QueryExecutionError(const int32_t error_code) : std::runtime_error("Query execution failed with error code " + - std::to_string(error_code)) + QueryExecutionError::toString(error_code)) , error_code_(error_code) {} int32_t getErrorCode() const { return error_code_; } + bool hasErrorCode(ErrorCode const ec) const { + return error_code_ == static_cast(ec); + } + + inline static std::string toString(int32_t error_code) { + if (size_t(error_code) < size_t(ErrorCode::N_)) { + return to_string(static_cast(error_code)); + } else { + return std::to_string(error_code); + } + } + bool wasMultifragKernelLaunch() const { return execution_props_ && (*execution_props_).was_multifrag_kernel_launch; } protected: - int32_t error_code_; + int32_t error_code_; // May be out-of-range of enum class ErrorCode values. boost::optional execution_props_; }; diff --git a/QueryEngine/Execute.cpp b/QueryEngine/Execute.cpp index d1c3712537..206c60ebab 100644 --- a/QueryEngine/Execute.cpp +++ b/QueryEngine/Execute.cpp @@ -26,12 +26,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include "Catalog/Catalog.h" #include "CudaMgr/CudaMgr.h" @@ -82,6 +82,10 @@ bool g_enable_dynamic_watchdog{false}; size_t g_watchdog_none_encoded_string_translation_limit{1000000UL}; size_t g_watchdog_max_projected_rows_per_device{128000000}; size_t g_preflight_count_query_threshold{1000000}; +size_t g_watchdog_in_clause_max_num_elem_non_bitmap{10000}; +size_t g_watchdog_in_clause_max_num_elem_bitmap{1 << 25}; +size_t g_watchdog_in_clause_max_num_input_rows{5000000}; +size_t g_in_clause_num_elem_skip_bitmap{100}; bool g_enable_cpu_sub_tasks{false}; size_t g_cpu_sub_task_size{500'000}; bool g_enable_filter_function{true}; @@ -190,13 +194,13 @@ bool g_executor_resource_mgr_allow_cpu_slot_oversubscription_concurrency{false}; bool g_executor_resource_mgr_allow_cpu_result_mem_oversubscription_concurrency{false}; double 
g_executor_resource_mgr_max_available_resource_use_ratio{0.8}; +bool g_use_cpu_mem_pool_for_output_buffers{false}; + extern bool g_cache_string_hash; extern bool g_allow_memory_status_log; int const Executor::max_gpu_count; -const int32_t Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES; - std::map Executor::extension_module_sources; extern std::unique_ptr read_llvm_module_from_bc_file( @@ -670,12 +674,33 @@ RowSetMemoryOwner::getOrAddStringProxyNumericTranslationMap( return addStringProxyNumericTranslationMap(source_proxy, string_op_infos); } -quantile::TDigest* RowSetMemoryOwner::nullTDigest(double const q) { +quantile::TDigest* RowSetMemoryOwner::initTDigest(size_t const thread_idx, + ApproxQuantileDescriptor const desc, + double const q) { + static_assert(std::is_trivially_copyable_v); std::lock_guard lock(state_mutex_); - return t_digests_ - .emplace_back(std::make_unique( - q, this, g_approx_quantile_buffer, g_approx_quantile_centroids)) - .get(); + auto t_digest = std::make_unique( + q, &t_digest_allocators_[thread_idx], desc.buffer_size, desc.centroids_size); + return t_digests_.emplace_back(std::move(t_digest)).get(); +} + +void RowSetMemoryOwner::reserveTDigestMemory(size_t thread_idx, size_t capacity) { + std::unique_lock lock(state_mutex_); + if (t_digest_allocators_.size() <= thread_idx) { + t_digest_allocators_.resize(thread_idx + 1u); + } + if (t_digest_allocators_[thread_idx].capacity()) { + // This can only happen when a thread_idx is re-used. In other words, + // two or more kernels have launched (serially!) using the same thread_idx. + // This is ok since TDigestAllocator does not own the memory it allocates. + VLOG(2) << "Replacing t_digest_allocators_[" << thread_idx << "]."; + } + lock.unlock(); + // This is not locked due to use of same state_mutex_ during allocation. + // The corresponding deallocation happens in ~DramArena(). 
+ int8_t* const buffer = allocate(capacity, thread_idx); + lock.lock(); + t_digest_allocators_[thread_idx] = TDigestAllocator(buffer, capacity); } bool Executor::isCPUOnly() const { @@ -1483,7 +1508,7 @@ std::pair Executor::reduceResults(const SQLAgg agg, if (agg_result == agg_init_val) { agg_result = out_vec[i]; } else if (out_vec[i] != agg_result) { - return {agg_result, Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES}; + return {agg_result, int32_t(ErrorCode::SINGLE_VALUE_FOUND_MULTIPLE_VALUES)}; } } } @@ -2084,6 +2109,7 @@ ResultSetPtr Executor::executeWorkUnit(size_t& max_groups_buffer_entry_guess, ScopeGuard cleanup_post_execution = [this] { // cleanup/unpin GPU buffer allocations // TODO: separate out this state into a single object + VLOG(1) << "Perform post execution clearance for Executor " << executor_id_; plan_state_.reset(nullptr); if (cgen_state_) { cgen_state_->in_values_bitmaps_.clear(); @@ -2239,6 +2265,9 @@ ResultSetPtr Executor::executeWorkUnitImpl( render_info, available_gpus, available_cpus); + if (!kernels.empty()) { + row_set_mem_owner_->setKernelMemoryAllocator(kernels.size()); + } if (g_enable_executor_resource_mgr) { launchKernelsViaResourceMgr(shared_context, std::move(kernels), @@ -2252,13 +2281,13 @@ ResultSetPtr Executor::executeWorkUnitImpl( } catch (QueryExecutionError& e) { if (eo.with_dynamic_watchdog && interrupted_.load() && - e.getErrorCode() == ERR_OUT_OF_TIME) { - throw QueryExecutionError(ERR_INTERRUPTED); + e.hasErrorCode(ErrorCode::OUT_OF_TIME)) { + throw QueryExecutionError(ErrorCode::INTERRUPTED); } - if (e.getErrorCode() == ERR_INTERRUPTED) { - throw QueryExecutionError(ERR_INTERRUPTED); + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { + throw QueryExecutionError(ErrorCode::INTERRUPTED); } - if (e.getErrorCode() == ERR_OVERFLOW_OR_UNDERFLOW && + if (e.hasErrorCode(ErrorCode::OVERFLOW_OR_UNDERFLOW) && static_cast(crt_min_byte_width << 1) <= sizeof(int64_t)) { crt_min_byte_width <<= 1; continue; @@ -2311,7 +2340,7 @@ 
ResultSetPtr Executor::executeWorkUnitImpl( query_comp_desc_owned->getDeviceType(), row_set_mem_owner); } catch (ReductionRanOutOfSlots&) { - throw QueryExecutionError(ERR_OUT_OF_SLOTS); + throw QueryExecutionError(ErrorCode::OUT_OF_SLOTS); } catch (OverflowOrUnderflow&) { crt_min_byte_width <<= 1; continue; @@ -2601,9 +2630,12 @@ void fill_entries_for_empty_input(std::vector& target_infos, query_mem_desc.getCountDistinctDescriptor(target_idx); if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) { CHECK(row_set_mem_owner); - auto count_distinct_buffer = row_set_mem_owner->allocateCountDistinctBuffer( - count_distinct_desc.bitmapPaddedSizeBytes(), - /*thread_idx=*/0); // TODO: can we detect thread idx here? + // TODO: can we detect thread idx here? + constexpr size_t thread_idx{0}; + const auto bitmap_size = count_distinct_desc.bitmapPaddedSizeBytes(); + row_set_mem_owner->initCountDistinctBufferAllocator(bitmap_size, thread_idx); + auto count_distinct_buffer = + row_set_mem_owner->allocateCountDistinctBuffer(bitmap_size, thread_idx); entry.push_back(reinterpret_cast(count_distinct_buffer)); continue; } @@ -2664,6 +2696,7 @@ ResultSetPtr build_row_for_empty_input( fill_entries_for_empty_input(target_infos, entry, target_exprs, query_mem_desc); const auto executor = query_mem_desc.getExecutor(); CHECK(executor); + // todo(yoonmin): Can we avoid initialize DramArena for this empty result case? 
auto row_set_mem_owner = executor->getRowSetMemoryOwner(); CHECK(row_set_mem_owner); auto rs = std::make_shared(target_infos, @@ -3001,7 +3034,6 @@ std::vector> Executor::createKernels( fragment_descriptor.assignFragsToKernelDispatch(fragment_per_kernel_dispatch, ra_exe_unit); } - return execution_kernels; } @@ -3465,11 +3497,11 @@ FetchResult Executor::fetchChunks( checkIsQuerySessionInterrupted(query_session, session_read_lock); } if (isInterrupted) { - throw QueryExecutionError(ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } } if (g_enable_dynamic_watchdog && interrupted_.load()) { - throw QueryExecutionError(ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } CHECK(col_id); const auto cd = try_get_column_descriptor(col_id.get()); @@ -3659,7 +3691,7 @@ FetchResult Executor::fetchUnionChunks( isInterrupted = checkIsQuerySessionInterrupted(query_session, session_read_lock); } if (isInterrupted) { - throw QueryExecutionError(ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } } std::vector frag_col_buffers( @@ -3849,11 +3881,11 @@ int32_t Executor::executePlanWithoutGroupBy( isInterrupted = checkIsQuerySessionInterrupted(query_session, session_read_lock); } if (isInterrupted) { - throw QueryExecutionError(ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } } if (g_enable_dynamic_watchdog && interrupted_.load()) { - throw QueryExecutionError(ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } if (device_type == ExecutorDeviceType::CPU) { CpuCompilationContext* cpu_generated_code = @@ -3900,18 +3932,19 @@ int32_t Executor::executePlanWithoutGroupBy( optimize_cuda_block_and_grid_sizes); output_memory_scope.reset(new OutVecOwner(out_vec)); } catch (const OutOfMemory&) { - return ERR_OUT_OF_GPU_MEM; + return int32_t(ErrorCode::OUT_OF_GPU_MEM); } catch (const std::exception& e) { LOG(FATAL) << "Error launching the GPU kernel: " << e.what(); } } - if (error_code == 
Executor::ERR_OVERFLOW_OR_UNDERFLOW || - error_code == Executor::ERR_DIV_BY_ZERO || - error_code == Executor::ERR_OUT_OF_TIME || - error_code == Executor::ERR_INTERRUPTED || - error_code == Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES || - error_code == Executor::ERR_GEOS || - error_code == Executor::ERR_WIDTH_BUCKET_INVALID_ARGUMENT) { + if (heavyai::IsAny::check(error_code)) { return error_code; } if (ra_exe_unit.estimator) { @@ -4070,11 +4103,11 @@ int32_t Executor::executePlanWithGroupBy( isInterrupted = checkIsQuerySessionInterrupted(query_session, session_read_lock); } if (isInterrupted) { - throw QueryExecutionError(ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } } if (g_enable_dynamic_watchdog && interrupted_.load()) { - return ERR_INTERRUPTED; + return int32_t(ErrorCode::INTERRUPTED); } RenderAllocatorMap* render_allocator_map_ptr = nullptr; @@ -4169,28 +4202,29 @@ int32_t Executor::executePlanWithGroupBy( render_allocator_map_ptr, optimize_cuda_block_and_grid_sizes); } catch (const OutOfMemory&) { - return ERR_OUT_OF_GPU_MEM; + return int32_t(ErrorCode::OUT_OF_GPU_MEM); } catch (const OutOfRenderMemory&) { - return ERR_OUT_OF_RENDER_MEM; + return int32_t(ErrorCode::OUT_OF_RENDER_MEM); } catch (const StreamingTopNNotSupportedInRenderQuery&) { - return ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY; + return int32_t(ErrorCode::STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY); } catch (const std::exception& e) { LOG(FATAL) << "Error launching the GPU kernel: " << e.what(); } } - if (error_code == Executor::ERR_OVERFLOW_OR_UNDERFLOW || - error_code == Executor::ERR_DIV_BY_ZERO || - error_code == Executor::ERR_OUT_OF_TIME || - error_code == Executor::ERR_INTERRUPTED || - error_code == Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES || - error_code == Executor::ERR_GEOS || - error_code == Executor::ERR_WIDTH_BUCKET_INVALID_ARGUMENT) { + if (heavyai::IsAny::check(error_code)) { return error_code; } - if (results && error_code != 
Executor::ERR_OVERFLOW_OR_UNDERFLOW && - error_code != Executor::ERR_DIV_BY_ZERO && !render_allocator_map_ptr) { + if (results && error_code != int32_t(ErrorCode::OVERFLOW_OR_UNDERFLOW) && + error_code != int32_t(ErrorCode::DIV_BY_ZERO) && !render_allocator_map_ptr) { *results = query_exe_context->getRowSet(ra_exe_unit_copy, query_exe_context->query_mem_desc_); CHECK(*results); @@ -4287,7 +4321,7 @@ Executor::JoinHashTableOrError Executor::buildHashTableForQualifier( "Bounding box intersection disabled, attempting to fall back to loop join"}; } if (g_enable_dynamic_watchdog && interrupted_.load()) { - throw QueryExecutionError(ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } try { auto tbl = HashJoin::getInstance(qual_bin_oper, @@ -4925,8 +4959,8 @@ TableGenerations Executor::computeTableGenerations( void Executor::setupCaching(const std::unordered_set& phys_inputs, const std::unordered_set& phys_table_ids) { - row_set_mem_owner_ = std::make_shared( - Executor::getArenaBlockSize(), executor_id_, cpu_threads()); + row_set_mem_owner_ = + std::make_shared(Executor::getArenaBlockSize(), executor_id_); row_set_mem_owner_->setDictionaryGenerations( computeStringDictionaryGenerations(phys_inputs)); agg_col_range_cache_ = computeColRangesCache(phys_inputs); @@ -5020,7 +5054,7 @@ void Executor::checkPendingQueryStatus(const QuerySessionId& query_session) { return; } if (queries_interrupt_flag_[query_session]) { - throw QueryExecutionError(Executor::ERR_INTERRUPTED); + throw QueryExecutionError(ErrorCode::INTERRUPTED); } } diff --git a/QueryEngine/Execute.h b/QueryEngine/Execute.h index 2496687d24..dc54893c66 100644 --- a/QueryEngine/Execute.h +++ b/QueryEngine/Execute.h @@ -1612,23 +1612,6 @@ class Executor { static QueryPlanDAG latest_query_plan_extracted_; public: - static const int32_t ERR_DIV_BY_ZERO{1}; - static const int32_t ERR_OUT_OF_GPU_MEM{2}; - static const int32_t ERR_OUT_OF_SLOTS{3}; - static const int32_t ERR_UNSUPPORTED_SELF_JOIN{4}; - 
static const int32_t ERR_OUT_OF_RENDER_MEM{5}; - static const int32_t ERR_OUT_OF_CPU_MEM{6}; - static const int32_t ERR_OVERFLOW_OR_UNDERFLOW{7}; - static const int32_t ERR_OUT_OF_TIME{9}; - static const int32_t ERR_INTERRUPTED{10}; - static const int32_t ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED{11}; - static const int32_t ERR_TOO_MANY_LITERALS{12}; - static const int32_t ERR_STRING_CONST_IN_RESULTSET{13}; - static const int32_t ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY{14}; - static const int32_t ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES{15}; - static const int32_t ERR_GEOS{16}; - static const int32_t ERR_WIDTH_BUCKET_INVALID_ARGUMENT{17}; - // Although compilation is Executor-local, an executor may trigger // threaded compilations (see executeWorkUnitPerFragment) that share // executor cgen_state and LLVM context, for instance. diff --git a/QueryEngine/ExecuteUpdate.cpp b/QueryEngine/ExecuteUpdate.cpp index b1e6604b00..9f7327dcd3 100644 --- a/QueryEngine/ExecuteUpdate.cpp +++ b/QueryEngine/ExecuteUpdate.cpp @@ -158,7 +158,6 @@ TableUpdateMetadata Executor::executeUpdate( continue; } fragments[0] = {outer_table_key, {fragment_index}}; - { ExecutionKernel current_fragment_kernel(ra_exe_unit, ExecutorDeviceType::CPU, diff --git a/QueryEngine/ExecutionKernel.cpp b/QueryEngine/ExecutionKernel.cpp index 578f0b8ef1..0fc28cc47e 100644 --- a/QueryEngine/ExecutionKernel.cpp +++ b/QueryEngine/ExecutionKernel.cpp @@ -134,24 +134,24 @@ void ExecutionKernel::run(Executor* executor, try { runImpl(executor, thread_idx, shared_context); } catch (const OutOfHostMemory& e) { - throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM, e.what()); + throw QueryExecutionError(ErrorCode::OUT_OF_CPU_MEM, e.what()); } catch (const std::bad_alloc& e) { - throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM, e.what()); + throw QueryExecutionError(ErrorCode::OUT_OF_CPU_MEM, e.what()); } catch (const OutOfRenderMemory& e) { - throw QueryExecutionError(Executor::ERR_OUT_OF_RENDER_MEM, 
e.what()); + throw QueryExecutionError(ErrorCode::OUT_OF_RENDER_MEM, e.what()); } catch (const OutOfMemory& e) { throw QueryExecutionError( - Executor::ERR_OUT_OF_GPU_MEM, + ErrorCode::OUT_OF_GPU_MEM, e.what(), QueryExecutionProperties{ query_mem_desc.getQueryDescriptionType(), kernel_dispatch_mode == ExecutorDispatchMode::MultifragmentKernel}); } catch (const ColumnarConversionNotSupported& e) { - throw QueryExecutionError(Executor::ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED, e.what()); + throw QueryExecutionError(ErrorCode::COLUMNAR_CONVERSION_NOT_SUPPORTED, e.what()); } catch (const TooManyLiterals& e) { - throw QueryExecutionError(Executor::ERR_TOO_MANY_LITERALS, e.what()); + throw QueryExecutionError(ErrorCode::TOO_MANY_LITERALS, e.what()); } catch (const StringConstInResultSet& e) { - throw QueryExecutionError(Executor::ERR_STRING_CONST_IN_RESULTSET, e.what()); + throw QueryExecutionError(ErrorCode::STRING_CONST_IN_RESULTSET, e.what()); } catch (const QueryExecutionError& e) { throw e; } @@ -257,8 +257,8 @@ void ExecutionKernel::runImpl(Executor* executor, } } catch (const OutOfMemory&) { throw QueryExecutionError( - memory_level == Data_Namespace::GPU_LEVEL ? Executor::ERR_OUT_OF_GPU_MEM - : Executor::ERR_OUT_OF_CPU_MEM, + memory_level == Data_Namespace::GPU_LEVEL ? ErrorCode::OUT_OF_GPU_MEM + : ErrorCode::OUT_OF_CPU_MEM, QueryExecutionProperties{ query_mem_desc.getQueryDescriptionType(), kernel_dispatch_mode == ExecutorDispatchMode::MultifragmentKernel}); @@ -400,7 +400,7 @@ void ExecutionKernel::runImpl(Executor* executor, thread_idx, do_render ? 
render_info_ : nullptr); } catch (const OutOfHostMemory& e) { - throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM); + throw QueryExecutionError(ErrorCode::OUT_OF_CPU_MEM); } } QueryExecutionContext* query_exe_context{query_exe_context_owned.get()}; @@ -486,24 +486,24 @@ void KernelSubtask::run(Executor* executor) { try { runImpl(executor); } catch (const OutOfHostMemory& e) { - throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM, e.what()); + throw QueryExecutionError(ErrorCode::OUT_OF_CPU_MEM, e.what()); } catch (const std::bad_alloc& e) { - throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM, e.what()); + throw QueryExecutionError(ErrorCode::OUT_OF_CPU_MEM, e.what()); } catch (const OutOfRenderMemory& e) { - throw QueryExecutionError(Executor::ERR_OUT_OF_RENDER_MEM, e.what()); + throw QueryExecutionError(ErrorCode::OUT_OF_RENDER_MEM, e.what()); } catch (const OutOfMemory& e) { throw QueryExecutionError( - Executor::ERR_OUT_OF_GPU_MEM, + ErrorCode::OUT_OF_GPU_MEM, e.what(), QueryExecutionProperties{ kernel_.query_mem_desc.getQueryDescriptionType(), kernel_.kernel_dispatch_mode == ExecutorDispatchMode::MultifragmentKernel}); } catch (const ColumnarConversionNotSupported& e) { - throw QueryExecutionError(Executor::ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED, e.what()); + throw QueryExecutionError(ErrorCode::COLUMNAR_CONVERSION_NOT_SUPPORTED, e.what()); } catch (const TooManyLiterals& e) { - throw QueryExecutionError(Executor::ERR_TOO_MANY_LITERALS, e.what()); + throw QueryExecutionError(ErrorCode::TOO_MANY_LITERALS, e.what()); } catch (const StringConstInResultSet& e) { - throw QueryExecutionError(Executor::ERR_STRING_CONST_IN_RESULTSET, e.what()); + throw QueryExecutionError(ErrorCode::STRING_CONST_IN_RESULTSET, e.what()); } catch (const QueryExecutionError& e) { throw e; } @@ -545,7 +545,7 @@ void KernelSubtask::runImpl(Executor* executor) { thread_idx_, do_render ? 
kernel_.render_info_ : nullptr); } catch (const OutOfHostMemory& e) { - throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM); + throw QueryExecutionError(ErrorCode::OUT_OF_CPU_MEM); } } diff --git a/QueryEngine/ExpressionRewrite.h b/QueryEngine/ExpressionRewrite.h index d9a1751e38..99727ea113 100644 --- a/QueryEngine/ExpressionRewrite.h +++ b/QueryEngine/ExpressionRewrite.h @@ -161,6 +161,21 @@ struct BoundingBoxIntersectJoinSupportedFunction { ST_DISTANCE_sv, ST_DWITHIN_POINT_POINT_sv}; + static constexpr std::array + ST_CONTAIN_FORCE_TABLE_REORDERING_TARGET_FUNC{ST_CONTAINS_POLYGON_POINT_sv, + ST_CONTAINS_MULTIPOLYGON_POINT_sv, + ST_CCONTAINS_POLYGON_POINT_sv, + ST_CCONTAINS_MULTIPOLYGON_POINT_sv}; + + static constexpr std::array + ST_INTERSECTS_FORCE_TABLE_REORDERING_TARGET_FUNC{ + ST_INTERSECTS_POINT_POLYGON_sv, + ST_INTERSECTS_POINT_MULTIPOLYGON_sv, + ST_INTERSECTS_POLYGON_POINT_sv, + ST_INTERSECTS_MULTIPOLYGON_POINT_sv, + ST_CINTERSECTS_POLYGON_POINT_sv, + ST_CINTERSECTS_MULTIPOLYGON_POINT_sv}; + static bool is_bbox_intersect_supported_func(std::string_view target_func_name) { return std::any_of( BoundingBoxIntersectJoinSupportedFunction::BBOX_INTERSECT_SUPPORTED_FUNC.begin(), diff --git a/QueryEngine/FromTableReordering.cpp b/QueryEngine/FromTableReordering.cpp index 9f356680a1..71a1b7b3aa 100644 --- a/QueryEngine/FromTableReordering.cpp +++ b/QueryEngine/FromTableReordering.cpp @@ -29,18 +29,95 @@ namespace { using cost_t = unsigned; using node_t = size_t; +const Analyzer::ColumnVar* get_geo_cv( + std::vector const& geo_args, + shared::TableKey const& table_key) { + auto it = std::find_if( + geo_args.begin(), geo_args.end(), [&table_key](const Analyzer::ColumnVar* cv) { + return cv->getTableKey() == table_key; + }); + return it == geo_args.end() ? 
nullptr : *it; +} + static std::unordered_map GEO_TYPE_COSTS{{kPOINT, 60}, {kARRAY, 60}, {kLINESTRING, 70}, {kPOLYGON, 80}, {kMULTIPOLYGON, 90}}; +static bool force_table_reordering_st_contain_func(std::string_view target_func_name) { + return std::any_of(BoundingBoxIntersectJoinSupportedFunction:: + ST_CONTAIN_FORCE_TABLE_REORDERING_TARGET_FUNC.begin(), + BoundingBoxIntersectJoinSupportedFunction:: + ST_CONTAIN_FORCE_TABLE_REORDERING_TARGET_FUNC.end(), + [target_func_name](std::string_view func_name) { + return target_func_name == func_name; + }); +} + +static bool force_table_reordering_st_intersects_func(std::string_view target_func_name) { + return std::any_of(BoundingBoxIntersectJoinSupportedFunction:: + ST_INTERSECTS_FORCE_TABLE_REORDERING_TARGET_FUNC.begin(), + BoundingBoxIntersectJoinSupportedFunction:: + ST_INTERSECTS_FORCE_TABLE_REORDERING_TARGET_FUNC.end(), + [target_func_name](std::string_view func_name) { + return target_func_name == func_name; + }); +} + +bool should_force_table_reordering(shared::TableKey const& inner_arg_key, + SQLTypes const inner_type, + shared::TableKey const& outer_arg_key, + SQLTypes const outer_type, + std::string const& geo_func_name, + const std::vector& table_infos) { + // if, |R| > |S| + // case-1: SELECT ... FROM R, S WHERE ST_...(R.c, S.c); + // case-2: SELECT ... FROM R, S WHERE ST_...(S.c, R.c); + // case-3: SELECT ... FROM S, R WHERE ST_...(R.c, S.c); + // case-4: SELECT ... 
FROM S, R WHERE ST_...(S.c, R.c); + auto const inner_poly_outer_pt_pair = + shared::is_any(inner_type) && outer_type == kPOINT; + auto const outer_poly_inner_pt_pair = + shared::is_any(outer_type) && inner_type == kPOINT; + auto const force_swap_st_contains = + force_table_reordering_st_contain_func(geo_func_name); + auto const force_swap_st_intersects = + force_table_reordering_st_intersects_func(geo_func_name); + size_t inner_idx = 0; + size_t outer_idx = 0; + for (size_t i = 0; i < table_infos.size(); i++) { + if (table_infos[i].table_key == inner_arg_key) { + inner_idx = i; + } else if (table_infos[i].table_key == outer_arg_key) { + outer_idx = i; + } + } + size_t first_listed_idx = std::min(inner_idx, outer_idx); + size_t first_listed_card = table_infos[first_listed_idx].info.getNumTuples(); + size_t last_listed_idx = std::max(inner_idx, outer_idx); + size_t last_listed_card = table_infos[last_listed_idx].info.getNumTuples(); + if (first_listed_card > last_listed_card) { + if (inner_arg_key == table_infos[first_listed_idx].table_key) { + // case 1 + return inner_poly_outer_pt_pair && + (force_swap_st_contains || force_swap_st_intersects); + } else { + // case 2 + CHECK_EQ(outer_arg_key, table_infos[first_listed_idx].table_key); + return outer_poly_inner_pt_pair && force_swap_st_intersects; + } + } + return false; +} + // Returns a lhs/rhs cost for the given qualifier. Must be strictly greater than 0. 
// todo (yoonmin): compute the cost of inner join edge and outer join edge // Currently, we set 100 for inner join and 200 for loop join // for geometries, we use types of geometries as its cost factor std::tuple get_join_qual_cost( const Analyzer::Expr* qual, + const std::vector& table_infos, const Executor* executor) { if (executor) { GeospatialFunctionFinder geo_func_finder; @@ -64,15 +141,8 @@ std::tuple get_join_qual_cost( // but |R| is not that larger than |S|, i.e., |R| / |S| < 10.0 // in this case, it might be better if keeping the existing ordering // to exploit bounding-box intersection w/ hash join framework instead of loop join - const auto& geo_args = geo_func_finder.getGeoArgCvs(); - const auto inner_cv_it = - std::find_if(geo_args.begin(), - geo_args.end(), - [&inner_table_key](const Analyzer::ColumnVar* cv) { - return cv->getTableKey() == inner_table_key; - }); - CHECK(inner_cv_it != geo_args.end()); - const auto inner_cv = *inner_cv_it; + const auto inner_cv = get_geo_cv(geo_func_finder.getGeoArgCvs(), inner_table_key); + CHECK(inner_cv); bool needs_table_reordering = inner_table_cardinality < outer_table_cardinality; const auto outer_inner_card_ratio = outer_table_cardinality / static_cast(inner_table_cardinality); @@ -96,6 +166,8 @@ std::tuple get_join_qual_cost( // to avoid too expensive hash join // so let's try to set inner table as poly table to invalidate // rte index requirement + VLOG(1) << "Force loop-join to avoid unexpected overhead of building large " + "hash table"; return {200, 200, InnerQualDecision::RHS}; } else { // otherwise, try to keep the existing ordering @@ -104,9 +176,28 @@ std::tuple get_join_qual_cost( } else { // poly is the inner table, so we need to reorder tables to use // bbox-intersection + auto const geo_func_name = geo_func_finder.getGeoFunctionName(); + const auto outer_cv = + get_geo_cv(geo_func_finder.getGeoArgCvs(), outer_table_key); + CHECK(outer_cv); + auto const inner_type = 
inner_cv->get_type_info().get_type(); + auto const outer_type = outer_cv->get_type_info().get_type(); + if (!needs_table_reordering && should_force_table_reordering(inner_table_key, + inner_type, + outer_table_key, + outer_type, + geo_func_name, + table_infos)) { + VLOG(1) << "Force reordering tables to enable a hash join for " + << geo_func_name; + // let's reorder them regardless of table cardinality to build a hash table on + // polygon side which can exploit bounding-box intersection + return {190, 180, InnerQualDecision::RHS}; + } if (needs_table_reordering) { // outer point table is larger than inner poly table, so let's reorder them // by table cardinality + VLOG(1) << "Try to reorder tables based on table cardinality"; return {200, 200, InnerQualDecision::RHS}; } else { // otherwise, try to keep the existing ordering @@ -208,9 +299,8 @@ std::vector> build_join_cost_graph( qual_nest_levels.erase(qual_nest_levels.begin()); int rhs_nest_level = *qual_nest_levels.begin(); CHECK_GE(rhs_nest_level, 0); - // Get the {lhs, rhs} cost for the qual - const auto qual_costing = get_join_qual_cost(qual.get(), executor); + const auto qual_costing = get_join_qual_cost(qual.get(), table_infos, executor); qual_detection_res[lhs_nest_level][rhs_nest_level] = std::get<2>(qual_costing); qual_detection_res[rhs_nest_level][lhs_nest_level] = std::get<2>(qual_costing); const auto edge_it = join_cost_graph[lhs_nest_level].find(rhs_nest_level); diff --git a/QueryEngine/GeoIR.cpp b/QueryEngine/GeoIR.cpp index 854f645dd0..18539f2b16 100644 --- a/QueryEngine/GeoIR.cpp +++ b/QueryEngine/GeoIR.cpp @@ -20,6 +20,8 @@ #include "QueryEngine/GeoOperators/API.h" #include "QueryEngine/GeoOperators/Codegen.h" +using heavyai::ErrorCode; + ArrayLoadCodegen CodeGenerator::codegenGeoArrayLoadAndNullcheck(llvm::Value* byte_stream, llvm::Value* pos, const SQLTypeInfo& ti, @@ -472,7 +474,7 @@ std::vector CodeGenerator::codegenGeosPredicateCall( } cgen_state_->ir_builder_.CreateCondBr(status_lv, 
geos_pred_ok_bb, geos_pred_fail_bb); cgen_state_->ir_builder_.SetInsertPoint(geos_pred_fail_bb); - cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(Executor::ERR_GEOS)); + cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(ErrorCode::GEOS))); cgen_state_->needs_error_check_ = true; cgen_state_->ir_builder_.SetInsertPoint(geos_pred_ok_bb); auto res = cgen_state_->ir_builder_.CreateLoad( @@ -533,7 +535,7 @@ std::vector CodeGenerator::codegenGeosConstructorCall( } cgen_state_->ir_builder_.CreateCondBr(status_lv, geos_ok_bb, geos_fail_bb); cgen_state_->ir_builder_.SetInsertPoint(geos_fail_bb); - cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(Executor::ERR_GEOS)); + cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(ErrorCode::GEOS))); cgen_state_->needs_error_check_ = true; cgen_state_->ir_builder_.SetInsertPoint(geos_ok_bb); diff --git a/QueryEngine/GeoOperators/Codegen.cpp b/QueryEngine/GeoOperators/Codegen.cpp index 448c3c8351..8fbf6864e6 100644 --- a/QueryEngine/GeoOperators/Codegen.cpp +++ b/QueryEngine/GeoOperators/Codegen.cpp @@ -66,6 +66,12 @@ const Analyzer::Expr* Codegen::getOperand(const size_t index) { return operator_->getOperand(index); } +char const* Codegen::pointIsNullFunctionName(SQLTypeInfo const& geo_ti) { + CHECK_EQ(kPOINT, geo_ti.get_type()); + return geo_ti.get_compression() == kENCODING_GEOINT ? 
"point_int32_is_null" + : "point_double_is_null"; +} + std::string suffix(SQLTypes type) { if (type == kPOINT) { return std::string("_Point"); diff --git a/QueryEngine/GeoOperators/Codegen.h b/QueryEngine/GeoOperators/Codegen.h index 4962614c44..ccf623301b 100644 --- a/QueryEngine/GeoOperators/Codegen.h +++ b/QueryEngine/GeoOperators/Codegen.h @@ -63,6 +63,8 @@ class Codegen { virtual ~Codegen() {} + static char const* pointIsNullFunctionName(SQLTypeInfo const&); + protected: const Analyzer::GeoOperator* operator_; bool is_nullable_{true}; diff --git a/QueryEngine/GeoOperators/Distance.h b/QueryEngine/GeoOperators/Distance.h index 41882cdcb0..974087f525 100644 --- a/QueryEngine/GeoOperators/Distance.h +++ b/QueryEngine/GeoOperators/Distance.h @@ -100,16 +100,16 @@ class Distance : public Codegen { cgen_state->llInt( static_cast(inline_int_null_value()))); } + CHECK(operand_is_null_lv); is_null = builder.CreateOr(is_null, operand_is_null_lv); } is_coords_lv = false; } } else { - bool is_coords_lv{true}; for (size_t j = 0; j < num_physical_coord_lvs; j++) { // ptr CHECK_LT(arg_lvs_index, arg_lvs.size()); - auto array_buff_lv = arg_lvs[arg_lvs_index++]; + auto array_buff_lv = arg_lvs[arg_lvs_index]; if (j == 0) { // cast alloca to i8* array_buff_lv = builder.CreateBitCast( @@ -121,14 +121,25 @@ class Distance : public Codegen { } operand_lvs.push_back(array_buff_lv); if (is_nullable_ && is_coords_lv) { - auto coords_array_type = - llvm::dyn_cast(operand_lvs.back()->getType()); - CHECK(coords_array_type); - is_null = builder.CreateOr( - is_null, - builder.CreateICmpEQ(operand_lvs.back(), - llvm::ConstantPointerNull::get(coords_array_type))); + auto const geo_oper = dynamic_cast(operand); + if (geo_oper && geo_oper->getName() == "ST_Point") { + // ST_Point stores null_sentinel to its storage if it is null + CHECK_EQ(operand->get_type_info().get_compression(), kENCODING_NONE); + auto null_check_lv = + cgen_state->emitCall("point_double_is_null", 
{arg_lvs[arg_lvs_index]}); + is_null = builder.CreateOr(is_null, null_check_lv); + } else { + auto coords_array_type = + llvm::dyn_cast(operand_lvs.back()->getType()); + CHECK(coords_array_type); + is_null = builder.CreateOr( + is_null, + builder.CreateICmpEQ( + operand_lvs.back(), + llvm::ConstantPointerNull::get(coords_array_type))); + } } + arg_lvs_index++; is_coords_lv = false; CHECK_LT(arg_lvs_index, arg_lvs.size()); operand_lvs.push_back(arg_lvs[arg_lvs_index++]); diff --git a/QueryEngine/GeoOperators/NPoints.h b/QueryEngine/GeoOperators/NPoints.h index 2ee51a2c6d..9b7744d558 100644 --- a/QueryEngine/GeoOperators/NPoints.h +++ b/QueryEngine/GeoOperators/NPoints.h @@ -16,6 +16,7 @@ #pragma once +#include #include "QueryEngine/GeoOperators/Codegen.h" namespace spatial_type { @@ -30,51 +31,80 @@ class NPoints : public Codegen { const Analyzer::Expr* getOperand(const size_t index) final { CHECK_EQ(index, size_t(0)); - if (operand_owned_) { - return operand_owned_.get(); + if (col_var_owned_) { + return col_var_owned_.get(); } - const auto operand = operator_->getOperand(0); - auto col_var = dynamic_cast(operand); - CHECK(col_var); - - geo_ti_ = col_var->get_type_info(); + geo_ti_ = operand->get_type_info(); CHECK(geo_ti_.is_geometry()); is_nullable_ = !geo_ti_.get_notnull(); - - // create a new operand which is just the coords and codegen it - auto column_key = col_var->getColumnKey(); - column_key.column_id = column_key.column_id + 1; // + 1 for coords - const auto coords_cd = get_column_descriptor(column_key); - CHECK(coords_cd); - - operand_owned_ = std::make_unique( - coords_cd->columnType, column_key, col_var->get_rte_idx()); - return operand_owned_.get(); + if (auto col_var = dynamic_cast(operand)) { + // create a new operand which is just the coords and codegen it + auto column_key = col_var->getColumnKey(); + column_key.column_id = column_key.column_id + 1; // + 1 for coords + const auto coords_cd = get_column_descriptor(column_key); + CHECK(coords_cd); + 
col_var_owned_ = std::make_unique( + coords_cd->columnType, column_key, col_var->get_rte_idx()); + return col_var_owned_.get(); + } + return operand; } std::tuple, llvm::Value*> codegenLoads( const std::vector& arg_lvs, const std::vector& pos_lvs, CgenState* cgen_state) final { - CHECK_EQ(arg_lvs.size(), size_t(1)); - auto& argument_lv = arg_lvs.front(); - std::string fn_name("array_size"); - - const auto& elem_ti = getOperand(0)->get_type_info().get_elem_type(); - std::vector array_size_args{ - argument_lv, - pos_lvs.front(), - cgen_state->llInt(log2_bytes(elem_ti.get_logical_size()))}; - - const bool is_nullable = isNullable(); - - if (is_nullable) { - fn_name += "_nullable"; - array_size_args.push_back(cgen_state->inlineIntNull(getTypeInfo())); + llvm::Value* coords_arr_sz_lv{nullptr}; + if (auto geo_constant = dynamic_cast(getOperand(0))) { + // count points defined in the WKTString, i.e., POLYGON(1 1, 2 2, 3 3, 1 1) + // the validation of the WKTString must be checked before entering this logic + std::regex regex("-?[0-9]*\\.?[0-9]+\\s+-?[0-9]*\\.?[0-9]+"); + auto target = geo_constant->getWKTString(); + auto pt_begin = std::sregex_iterator(target.begin(), target.end(), regex); + auto pt_end = std::sregex_iterator(); + auto num_pts = std::distance(pt_begin, pt_end); + CHECK_GT(num_pts, 0); + coords_arr_sz_lv = cgen_state->llInt(16 * num_pts); + } else if (arg_lvs.size() == size_t(1)) { + std::string fn_name("array_size"); + bool is_nullable = isNullable(); + if (auto col_var = dynamic_cast(getOperand(0))) { + auto column_key = col_var->getColumnKey(); + column_key.column_id = column_key.column_id - 1; + const auto type_cd = get_column_descriptor(column_key); + CHECK(type_cd); + if (type_cd->columnType.get_type() == kPOINT) { + fn_name = "point_coord_array_size"; + } + } + auto& argument_lv = arg_lvs.front(); + const auto& elem_ti = getOperand(0)->get_type_info().get_elem_type(); + std::vector array_size_args{ + argument_lv, + pos_lvs.front(), + 
cgen_state->llInt(log2_bytes(elem_ti.get_logical_size()))}; + if (is_nullable) { + fn_name += "_nullable"; + array_size_args.push_back(cgen_state->inlineIntNull(getTypeInfo())); + } + coords_arr_sz_lv = cgen_state->emitExternalCall( + fn_name, get_int_type(32, cgen_state->context_), array_size_args); + } else if (arg_lvs.size() == size_t(2)) { + auto child_geo_oper = + dynamic_cast(operator_->getOperand(0)); + CHECK(child_geo_oper); + if (child_geo_oper->getName() == "ST_Point") { + coords_arr_sz_lv = cgen_state->ir_builder_.CreateSelect( + cgen_state->emitCall("point_double_is_null", {arg_lvs.front()}), + cgen_state->inlineIntNull(getTypeInfo()), + cgen_state->llInt(static_cast(16))); + } else { + CHECK(false) << "Not supported geo operator w/ ST_NPoints: " + << child_geo_oper->getName(); + } } - const auto coords_arr_sz_lv = cgen_state->emitExternalCall( - fn_name, get_int_type(32, cgen_state->context_), array_size_args); + CHECK(coords_arr_sz_lv); return std::make_tuple(std::vector{coords_arr_sz_lv}, coords_arr_sz_lv); } @@ -105,7 +135,7 @@ class NPoints : public Codegen { protected: SQLTypeInfo geo_ti_; - std::unique_ptr operand_owned_; + std::unique_ptr col_var_owned_; }; } // namespace spatial_type diff --git a/QueryEngine/GeoOperators/PointAccessors.h b/QueryEngine/GeoOperators/PointAccessors.h index 0f464b9745..2231adc952 100644 --- a/QueryEngine/GeoOperators/PointAccessors.h +++ b/QueryEngine/GeoOperators/PointAccessors.h @@ -31,6 +31,12 @@ class PointAccessors : public Codegen { SQLTypeInfo getNullType() const final { return SQLTypeInfo(kBOOLEAN); } + llvm::Value* codegenCmpEqNullptr(llvm::IRBuilder<>& builder, llvm::Value* arg_lv) { + auto* const ptr_type = llvm::dyn_cast(arg_lv->getType()); + CHECK(ptr_type); + return builder.CreateICmpEQ(arg_lv, llvm::ConstantPointerNull::get(ptr_type)); + } + // returns arguments lvs and null lv std::tuple, llvm::Value*> codegenLoads( const std::vector& arg_lvs, @@ -47,14 +53,8 @@ class PointAccessors : public Codegen 
{ llvm::Value* is_null{nullptr}; if (arg_lvs.size() == 1) { if (dynamic_cast(operand)) { - const auto ptr_type = - llvm::dyn_cast(arg_lvs.front()->getType()); - CHECK(ptr_type); - const auto is_null_lv = - builder.CreateICmp(llvm::CmpInst::ICMP_EQ, - arg_lvs.front(), - llvm::ConstantPointerNull::get(ptr_type)); - return std::make_tuple(arg_lvs, is_null_lv); + is_null = codegenCmpEqNullptr(builder, arg_lvs.front()); + return std::make_tuple(arg_lvs, is_null); } // col byte stream, get the array buffer ptr and is null attributes and cache auto arr_load_lvs = CodeGenerator::codegenGeoArrayLoadAndNullcheck( @@ -65,16 +65,17 @@ class PointAccessors : public Codegen { // ptr and size CHECK_EQ(arg_lvs.size(), size_t(2)); if (dynamic_cast(operand)) { - // null check will be if the ptr is a nullptr - is_null = builder.CreateICmp( - llvm::CmpInst::ICMP_EQ, - arg_lvs.front(), - llvm::ConstantPointerNull::get( // TODO: check ptr address space - geo_ti.get_compression() == kENCODING_GEOINT - ? llvm::Type::getInt32PtrTy(cgen_state->context_) - : llvm::Type::getDoublePtrTy(cgen_state->context_))); + if (geo_ti.get_type() == kPOINT && !geo_ti.is_variable_size()) { + char const* const fname = pointIsNullFunctionName(geo_ti); + is_null = cgen_state->emitCall(fname, {arg_lvs.front()}); + } else { + // The above branch tests for both nullptr and null sentinel, whereas this + // branch only tests for nullptr. If not for this branch, the GeospatialTest + // LLVMOptimization test fails due to non-removal of the + // decompress_{x,y}_coord_geoint function call in the generated IR. See QE-1007. 
+ is_null = codegenCmpEqNullptr(builder, arg_lvs.front()); + } } - // TODO: nulls from other types not yet supported array_buff_ptr = arg_lvs.front(); } diff --git a/QueryEngine/GeoOperators/PointConstructor.h b/QueryEngine/GeoOperators/PointConstructor.h index fd80cdd063..5718c18680 100644 --- a/QueryEngine/GeoOperators/PointConstructor.h +++ b/QueryEngine/GeoOperators/PointConstructor.h @@ -92,41 +92,12 @@ class PointConstructor : public Codegen { // function as position argumnts are not used here CHECK_EQ(arg_lvs.size(), size_t(2)); - auto& builder = cgen_state->ir_builder_; - - llvm::Value* is_null{nullptr}; - auto x_operand = getOperand(0); - const auto& x_ti = x_operand->get_type_info(); - if (!x_ti.get_notnull()) { - CHECK(x_ti.is_integer() || x_ti.is_fp()); - // TODO: centralize nullcheck logic for all sqltypes - is_null = x_ti.is_integer() - ? builder.CreateICmp(llvm::CmpInst::ICMP_EQ, - arg_lvs.front(), - cgen_state->llInt(inline_int_null_val(x_ti))) - : builder.CreateFCmp(llvm::FCmpInst::FCMP_OEQ, - arg_lvs.front(), - cgen_state->llFp(inline_fp_null_val(x_ti))); - } - - auto y_operand = getOperand(1); - const auto& y_ti = y_operand->get_type_info(); - if (!y_ti.get_notnull()) { - auto y_is_null = - y_ti.is_integer() - ? builder.CreateICmp(llvm::CmpInst::ICMP_EQ, - arg_lvs.front(), - cgen_state->llInt(inline_int_null_val(y_ti))) - : builder.CreateFCmp(llvm::FCmpInst::FCMP_OEQ, - arg_lvs.front(), - cgen_state->llFp(inline_fp_null_val(y_ti))); - if (is_null) { - // the point is null if at least one of its coordinate has null value - is_null = builder.CreateOr(is_null, y_is_null); - } else { - is_null = y_is_null; - } - } + // TODO(adb): centralize nullcheck logic for all sqltypes + llvm::Value* const x_is_null = codegenOperandIsNull(0u, arg_lvs[0], cgen_state); + llvm::Value* const y_is_null = codegenOperandIsNull(1u, arg_lvs[1], cgen_state); + llvm::Value* const is_null = + x_is_null && y_is_null ? 
cgen_state->ir_builder_.CreateOr(x_is_null, y_is_null) + : std::max(x_is_null, y_is_null); if (is_nullable_ && !is_null) { // if the inputs are not null, set the output to be not null @@ -147,8 +118,8 @@ class PointConstructor : public Codegen { auto elem_ty = llvm::Type::getDoubleTy(cgen_state->context_); arr_type = llvm::ArrayType::get(elem_ty, 2); } - pt_local_storage_lv_ = - builder.CreateAlloca(arr_type, nullptr, operator_->getName() + "_Local_Storage"); + pt_local_storage_lv_ = cgen_state->ir_builder_.CreateAlloca( + arr_type, nullptr, operator_->getName() + "_Local_Storage"); return std::make_tuple(arg_lvs, is_null); } @@ -213,6 +184,24 @@ class PointConstructor : public Codegen { } private: + llvm::Value* codegenOperandIsNull(size_t idx, + llvm::Value* value, + CgenState* cgen_state) { + auto const& ti = getOperand(idx)->get_type_info(); + if (ti.get_notnull()) { + return nullptr; + } else if (ti.is_integer()) { + return cgen_state->ir_builder_.CreateICmpEQ( + value, cgen_state->llInt(inline_int_null_val(ti))); + } else if (ti.is_fp()) { + return cgen_state->ir_builder_.CreateFCmpOEQ( + value, cgen_state->llFp(inline_fp_null_val(ti))); + } else { + UNREACHABLE() << "Type is expected to be integer or floating point."; + return {}; + } + } + llvm::AllocaInst* pt_local_storage_lv_{nullptr}; }; diff --git a/QueryEngine/GeoOperators/PointN.h b/QueryEngine/GeoOperators/PointN.h index 3f7adff4e8..3e5cebdce1 100644 --- a/QueryEngine/GeoOperators/PointN.h +++ b/QueryEngine/GeoOperators/PointN.h @@ -51,6 +51,52 @@ class PointN : public Codegen { return SQLTypeInfo(kBOOLEAN); } + llvm::Value* codegenGeoSize(CgenState* cgen_state, + SQLTypeInfo const& geo_ti, + const std::vector& arg_lvs, + const std::vector& pos_lvs) { + llvm::Value* geo_size_lv{nullptr}; + if (arg_lvs.size() == 2) { + const bool is_nullable = !geo_ti.get_notnull(); + std::string size_fn_name = "array_size"; + if (is_nullable) { + size_fn_name += "_nullable"; + } + + uint32_t elem_sz = 1; // TINYINT 
coords array + std::vector array_sz_args{ + arg_lvs.front(), pos_lvs.front(), cgen_state->llInt(log2_bytes(elem_sz))}; + if (is_nullable) { + array_sz_args.push_back( + cgen_state->llInt(static_cast(inline_int_null_value()))); + } + geo_size_lv = cgen_state->emitExternalCall( + size_fn_name, get_int_type(32, cgen_state->context_), array_sz_args); + } else { + geo_size_lv = arg_lvs[1]; + } + CHECK(geo_size_lv); + return geo_size_lv; + } + + llvm::Value* codegenIndexOutOfBoundCheck(CgenState* cgen_state, + llvm::Value* index_lv, + llvm::Value* geosize_lv) { + llvm::Value* is_null_lv = cgen_state->llBool(false); + is_null_lv = cgen_state->ir_builder_.CreateOr( + is_null_lv, + cgen_state->ir_builder_.CreateICmp(llvm::ICmpInst::ICMP_SLT, + index_lv, + cgen_state->llInt(static_cast(0)))); + return cgen_state->ir_builder_.CreateOr( + is_null_lv, + cgen_state->ir_builder_.CreateICmp( + llvm::ICmpInst::ICMP_SGE, + cgen_state->ir_builder_.CreateMul(index_lv, + cgen_state->llInt(static_cast(8))), + geosize_lv)); + } + // returns arguments lvs and null lv std::tuple, llvm::Value*> codegenLoads( const std::vector& arg_lvs, @@ -67,57 +113,42 @@ class PointN : public Codegen { std::vector array_operand_lvs; CHECK(!arg_lvs.empty()); - auto index_lv = builder.CreateMul( - builder.CreateSub(arg_lvs.back(), cgen_state->llInt(static_cast(1))), - cgen_state->llInt(static_cast(2))); - llvm::Value* is_null_lv{nullptr}; + llvm::Value* raw_index_lv = arg_lvs.back(); + llvm::Value* geo_size_lv = codegenGeoSize(cgen_state, geo_ti, arg_lvs, pos_lvs); + llvm::Value* pt_size_lv = cgen_state->llInt(16); + llvm::Value* num_pts_lv = builder.CreateUDiv(geo_size_lv, pt_size_lv); + llvm::Value* is_negative_lv = + builder.CreateICmpSLT(raw_index_lv, cgen_state->llInt(0)); + llvm::Value* negative_raw_index_lv = builder.CreateAdd(raw_index_lv, num_pts_lv); + llvm::Value* positive_raw_index_lv = + builder.CreateSub(raw_index_lv, cgen_state->llInt(1)); + raw_index_lv = builder.CreateSelect( + 
is_negative_lv, negative_raw_index_lv, positive_raw_index_lv); + raw_index_lv = + builder.CreateMul(raw_index_lv, cgen_state->llInt(static_cast(2))); + llvm::Value* is_null_lv = + codegenIndexOutOfBoundCheck(cgen_state, raw_index_lv, geo_size_lv); if (arg_lvs.size() == 2) { // col byte stream from column on disk array_operand_lvs.push_back( cgen_state->emitExternalCall("array_buff", llvm::Type::getInt8PtrTy(cgen_state->context_), {arg_lvs.front(), pos_lvs.front()})); - const bool is_nullable = !geo_ti.get_notnull(); - std::string size_fn_name = "array_size"; - if (is_nullable) { - size_fn_name += "_nullable"; - } - - uint32_t elem_sz = 1; // TINYINT coords array - std::vector array_sz_args{ - arg_lvs.front(), pos_lvs.front(), cgen_state->llInt(log2_bytes(elem_sz))}; - if (is_nullable) { - array_sz_args.push_back( - cgen_state->llInt(static_cast(inline_int_null_value()))); - } - array_operand_lvs.push_back(cgen_state->emitExternalCall( - size_fn_name, get_int_type(32, cgen_state->context_), array_sz_args)); - - auto geo_size_lv = array_operand_lvs.back(); + array_operand_lvs.push_back(geo_size_lv); // convert the index to a byte index - const auto outside_linestring_bounds_lv = builder.CreateNot(builder.CreateICmp( - llvm::ICmpInst::ICMP_SLT, - builder.CreateMul(index_lv, cgen_state->llInt(static_cast(8))), - geo_size_lv)); - outside_linestring_bounds_lv->setName("outside_linestring_bounds"); + raw_index_lv = + builder.CreateMul(raw_index_lv, cgen_state->llInt(static_cast(8))); const auto input_is_null_lv = builder.CreateICmp( llvm::ICmpInst::ICMP_EQ, geo_size_lv, cgen_state->llInt(static_cast(inline_int_null_value()))); - input_is_null_lv->setName("input_is_null"); - is_null_lv = builder.CreateOr(outside_linestring_bounds_lv, input_is_null_lv); + is_null_lv = builder.CreateOr(is_null_lv, input_is_null_lv); } else { CHECK_EQ(arg_lvs.size(), size_t(3)); // ptr, size, index array_operand_lvs.push_back(arg_lvs[0]); array_operand_lvs.push_back(arg_lvs[1]); - - const 
auto geo_size_lv = arg_lvs[1]; - // TODO: bounds indices are 64 bits but should be 32 bits, as array length is - // limited to 32 bits - is_null_lv = builder.CreateNot( - builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, index_lv, geo_size_lv)); } - array_operand_lvs.push_back(index_lv); + array_operand_lvs.push_back(raw_index_lv); return std::make_tuple(array_operand_lvs, is_null_lv); } diff --git a/QueryEngine/GeoOperators/Transform.h b/QueryEngine/GeoOperators/Transform.h index 1440e7667c..a94aee474c 100644 --- a/QueryEngine/GeoOperators/Transform.h +++ b/QueryEngine/GeoOperators/Transform.h @@ -70,19 +70,10 @@ class Transform : public Codegen { arg_lvs.size() == size_t(2)); // ptr or ptr, size // coming from a temporary, can modify the memory pointer directly can_transform_in_place_ = true; - auto& builder = cgen_state->ir_builder_; - - const auto is_null = builder.CreateICmp( - llvm::CmpInst::ICMP_EQ, - arg_lvs.front(), - llvm::ConstantPointerNull::get( // TODO: check ptr address space - operand_ti.get_compression() == kENCODING_GEOINT - ? 
llvm::Type::getInt32PtrTy(cgen_state->context_) - : llvm::Type::getDoublePtrTy(cgen_state->context_))); + char const* const fname = pointIsNullFunctionName(operand_ti); + llvm::Value* const is_null = cgen_state->emitCall(fname, {arg_lvs.front()}); return std::make_tuple(std::vector{arg_lvs.front()}, is_null); } - UNREACHABLE(); - return std::make_tuple(std::vector{}, nullptr); } std::vector codegen(const std::vector& args, diff --git a/QueryEngine/GroupByAndAggregate.cpp b/QueryEngine/GroupByAndAggregate.cpp index 1483a25ae0..dcdd033f13 100644 --- a/QueryEngine/GroupByAndAggregate.cpp +++ b/QueryEngine/GroupByAndAggregate.cpp @@ -54,7 +54,10 @@ bool g_cluster{false}; bool g_bigint_count{false}; int g_hll_precision_bits{11}; size_t g_watchdog_baseline_max_groups{120000000}; +extern size_t g_approx_quantile_buffer; +extern size_t g_approx_quantile_centroids; extern int64_t g_bitmap_memory_limit; +extern size_t g_default_max_groups_buffer_entry_guess; extern size_t g_leaf_count; bool ColRangeInfo::isEmpty() const { @@ -888,6 +891,25 @@ std::unique_ptr GroupByAndAggregate::initQueryMemoryDescr return query_mem_desc; } +ApproxQuantileDescriptors GroupByAndAggregate::initApproxQuantileDescriptors() { + // Count APPROX_QUANTILE targets + size_t target_count = 0u; + auto count_target = [&](Analyzer::AggExpr const*, size_t) { ++target_count; }; + ra_exe_unit_.eachAggTarget(count_target); + if (target_count == 0u) { + return {}; + } + + // Reserve and fill descriptors + std::vector descriptors; + descriptors.reserve(target_count); + auto add_descriptor = [&](Analyzer::AggExpr const*, size_t) { + descriptors.push_back({g_approx_quantile_buffer, g_approx_quantile_centroids}); + }; + ra_exe_unit_.eachAggTarget(add_descriptor); + return descriptors; +} + std::unique_ptr GroupByAndAggregate::initQueryMemoryDescriptorImpl( const bool allow_multifrag, const size_t max_groups_buffer_entry_count, @@ -936,6 +958,7 @@ std::unique_ptr GroupByAndAggregate::initQueryMemoryDescr const auto 
count_distinct_descriptors = init_count_distinct_descriptors( ra_exe_unit_, query_infos_, col_range_info, device_type_, executor_); + auto approx_quantile_descriptors = initApproxQuantileDescriptors(); try { return QueryMemoryDescriptor::init(executor_, ra_exe_unit_, @@ -949,6 +972,7 @@ std::unique_ptr GroupByAndAggregate::initQueryMemoryDescr shard_count, max_groups_buffer_entry_count, render_info, + approx_quantile_descriptors, count_distinct_descriptors, must_use_baseline_sort, output_columnar_hint, @@ -968,6 +992,7 @@ std::unique_ptr GroupByAndAggregate::initQueryMemoryDescr shard_count, max_groups_buffer_entry_count, render_info, + approx_quantile_descriptors, count_distinct_descriptors, must_use_baseline_sort, output_columnar_hint, diff --git a/QueryEngine/GroupByAndAggregate.h b/QueryEngine/GroupByAndAggregate.h index 6c959be3a7..9ceb96bd26 100644 --- a/QueryEngine/GroupByAndAggregate.h +++ b/QueryEngine/GroupByAndAggregate.h @@ -80,6 +80,8 @@ class GroupByAndAggregate { private: bool gpuCanHandleOrderEntries(const std::list& order_entries); + ApproxQuantileDescriptors initApproxQuantileDescriptors(); + std::unique_ptr initQueryMemoryDescriptor( const bool allow_multifrag, const size_t max_groups_buffer_entry_count, diff --git a/QueryEngine/IRCodegen.cpp b/QueryEngine/IRCodegen.cpp index 638787ed52..9aa057066f 100644 --- a/QueryEngine/IRCodegen.cpp +++ b/QueryEngine/IRCodegen.cpp @@ -487,7 +487,7 @@ llvm::Value* CodeGenerator::codegenWidthBucketExpr(const Analyzer::WidthBucketEx width_bucket_partition_count_ok_bb); cgen_state_->ir_builder_.SetInsertPoint(width_bucket_argument_check_fail_bb); cgen_state_->ir_builder_.CreateRet( - cgen_state_->llInt(Executor::ERR_WIDTH_BUCKET_INVALID_ARGUMENT)); + cgen_state_->llInt(int32_t(heavyai::ErrorCode::WIDTH_BUCKET_INVALID_ARGUMENT))); cgen_state_->ir_builder_.SetInsertPoint(width_bucket_partition_count_ok_bb); llvm::BasicBlock* width_bucket_bound_check_ok_bb = @@ -736,6 +736,7 @@ std::vector Executor::buildJoinLoops( 
co, current_hash_table_idx); domain.values_buffer = matching_set.elements; domain.element_count = matching_set.count; + domain.error_code = matching_set.error_code; return domain; }, /*outer_condition_match=*/ diff --git a/QueryEngine/InValuesBitmap.cpp b/QueryEngine/InValuesBitmap.cpp index fac32e1246..db4728de59 100644 --- a/QueryEngine/InValuesBitmap.cpp +++ b/QueryEngine/InValuesBitmap.cpp @@ -24,6 +24,7 @@ #include "../Shared/checked_alloc.h" #include "GroupByAndAggregate.h" #include "Logger/Logger.h" +#include "QueryEngine/CodegenHelper.h" #include "QueryEngine/QueryEngine.h" #include "RuntimeFunctions.h" @@ -35,12 +36,14 @@ InValuesBitmap::InValuesBitmap(const std::vector& values, const int64_t null_val, const Data_Namespace::MemoryLevel memory_level, const int device_count, - Data_Namespace::DataMgr* data_mgr) + Data_Namespace::DataMgr* data_mgr, + CompilationOptions const& co) : rhs_has_null_(false) , null_val_(null_val) , memory_level_(memory_level) , device_count_(device_count) - , data_mgr_(data_mgr) { + , data_mgr_(data_mgr) + , co_(co) { #ifdef HAVE_CUDA CHECK(memory_level_ == Data_Namespace::CPU_LEVEL || memory_level == Data_Namespace::GPU_LEVEL); @@ -120,10 +123,50 @@ InValuesBitmap::~InValuesBitmap() { } } +InValuesBitmap::BitIsSetParams InValuesBitmap::prepareBitIsSetParams( + Executor* executor, + std::vector> const& constant_owned) const { + BitIsSetParams params; + auto pi8_ty = + llvm::PointerType::get(get_int_type(8, executor->cgen_state_->context_), 0); + CodeGenerator code_generator(executor); + params.null_val_lv = + CodegenUtil::hoistLiteral( + &code_generator, co_, make_datum(null_val_), kBIGINT, device_count_) + .front(); + if (bitsets_.empty()) { + auto const zero_lvs = CodegenUtil::hoistLiteral( + &code_generator, co_, make_datum(0), kBIGINT, device_count_); + params.min_val_lv = zero_lvs.front(); + params.max_val_lv = zero_lvs.front(); + params.bitmap_ptr_lv = + executor->cgen_state_->ir_builder_.CreateIntToPtr(zero_lvs.front(), 
pi8_ty); + } else { + params.min_val_lv = + CodegenUtil::hoistLiteral( + &code_generator, co_, make_datum(min_val_), kBIGINT, device_count_) + .front(); + params.max_val_lv = + CodegenUtil::hoistLiteral( + &code_generator, co_, make_datum(max_val_), kBIGINT, device_count_) + .front(); + auto to_raw_ptr = [](const auto& ptr) { return ptr.get(); }; + auto begin = boost::make_transform_iterator(constant_owned.begin(), to_raw_ptr); + auto end = boost::make_transform_iterator(constant_owned.end(), to_raw_ptr); + std::vector bitmap_constants(begin, end); + const auto bitset_handle_lvs = + code_generator.codegenHoistedConstants(bitmap_constants, kENCODING_NONE, {}); + CHECK_EQ(size_t(1), bitset_handle_lvs.size()); + params.bitmap_ptr_lv = executor->cgen_state_->ir_builder_.CreateIntToPtr( + bitset_handle_lvs.front(), pi8_ty); + } + return params; +} + llvm::Value* InValuesBitmap::codegen(llvm::Value* needle, Executor* executor) const { - AUTOMATIC_IR_METADATA(executor->cgen_state_.get()); + auto cgen_state = executor->getCgenStatePtr(); + AUTOMATIC_IR_METADATA(cgen_state); std::vector> constants_owned; - std::vector constants; for (const auto bitset : bitsets_) { const int64_t bitset_handle = reinterpret_cast(bitset); const auto bitset_handle_literal = std::dynamic_pointer_cast( @@ -131,38 +174,18 @@ llvm::Value* InValuesBitmap::codegen(llvm::Value* needle, Executor* executor) co CHECK(bitset_handle_literal); CHECK_EQ(kENCODING_NONE, bitset_handle_literal->get_type_info().get_compression()); constants_owned.push_back(bitset_handle_literal); - constants.push_back(bitset_handle_literal.get()); } - const auto needle_i64 = executor->cgen_state_->castToTypeIn(needle, 64); + const auto needle_i64 = cgen_state->castToTypeIn(needle, 64); const auto null_bool_val = static_cast(inline_int_null_val(SQLTypeInfo(kBOOLEAN, false))); - auto pi8_ty = - llvm::PointerType::get(get_int_type(8, executor->cgen_state_->context_), 0); - if (bitsets_.empty()) { - auto empty_bitmap = 
executor->cgen_state_->llInt(int64_t(0)); - auto empty_bitmap_ptr = - executor->cgen_state_->ir_builder_.CreateIntToPtr(empty_bitmap, pi8_ty); - return executor->cgen_state_->emitCall("bit_is_set", - {empty_bitmap_ptr, - needle_i64, - executor->cgen_state_->llInt(int64_t(0)), - executor->cgen_state_->llInt(int64_t(0)), - executor->cgen_state_->llInt(null_val_), - executor->cgen_state_->llInt(null_bool_val)}); - } - CodeGenerator code_generator(executor); - const auto bitset_handle_lvs = - code_generator.codegenHoistedConstants(constants, kENCODING_NONE, {}); - CHECK_EQ(size_t(1), bitset_handle_lvs.size()); - auto bitset_ptr = executor->cgen_state_->ir_builder_.CreateIntToPtr( - bitset_handle_lvs.front(), pi8_ty); - return executor->cgen_state_->emitCall("bit_is_set", - {bitset_ptr, - needle_i64, - executor->cgen_state_->llInt(min_val_), - executor->cgen_state_->llInt(max_val_), - executor->cgen_state_->llInt(null_val_), - executor->cgen_state_->llInt(null_bool_val)}); + auto const func_params = prepareBitIsSetParams(executor, constants_owned); + return cgen_state->emitCall("bit_is_set", + {func_params.bitmap_ptr_lv, + needle_i64, + func_params.min_val_lv, + func_params.max_val_lv, + func_params.null_val_lv, + cgen_state->llInt(null_bool_val)}); } bool InValuesBitmap::isEmpty() const { diff --git a/QueryEngine/InValuesBitmap.h b/QueryEngine/InValuesBitmap.h index 7575e24fb3..440044957e 100644 --- a/QueryEngine/InValuesBitmap.h +++ b/QueryEngine/InValuesBitmap.h @@ -24,6 +24,7 @@ #define QUERYENGINE_INVALUESBITMAP_H #include "../DataMgr/DataMgr.h" +#include "ThriftHandler/CommandLineOptions.h" #include @@ -44,7 +45,8 @@ class InValuesBitmap { const int64_t null_val, const Data_Namespace::MemoryLevel memory_level, const int device_count, - Data_Namespace::DataMgr* data_mgr); + Data_Namespace::DataMgr* data_mgr, + CompilationOptions const& co); ~InValuesBitmap(); llvm::Value* codegen(llvm::Value* needle, Executor* executor) const; @@ -55,6 +57,17 @@ class 
InValuesBitmap { size_t gpuBuffers() const { return gpu_buffers_.size(); } + struct BitIsSetParams { + llvm::Value* bitmap_ptr_lv; + llvm::Value* min_val_lv; + llvm::Value* max_val_lv; + llvm::Value* null_val_lv; + }; + + BitIsSetParams prepareBitIsSetParams( + Executor* executor, + std::vector> const& constant_owned) const; + private: std::vector gpu_buffers_; std::vector bitsets_; @@ -65,6 +78,7 @@ class InValuesBitmap { const Data_Namespace::MemoryLevel memory_level_; const int device_count_; Data_Namespace::DataMgr* data_mgr_; + CompilationOptions co_; }; #endif // QUERYENGINE_INVALUESBITMAP_H diff --git a/QueryEngine/InValuesIR.cpp b/QueryEngine/InValuesIR.cpp index 674d8d9fb6..8ec9949c4e 100644 --- a/QueryEngine/InValuesIR.cpp +++ b/QueryEngine/InValuesIR.cpp @@ -91,7 +91,8 @@ llvm::Value* CodeGenerator::codegen(const Analyzer::InIntegerSet* in_integer_set co.device_type == ExecutorDeviceType::GPU ? Data_Namespace::GPU_LEVEL : Data_Namespace::CPU_LEVEL, executor()->deviceCount(co.device_type), - executor()->data_mgr_); + executor()->data_mgr_, + co); const auto& in_integer_set_ti = in_integer_set->get_type_info(); CHECK(in_integer_set_ti.is_boolean()); const auto lhs_lvs = codegen(in_arg, true, co); @@ -197,7 +198,8 @@ std::unique_ptr CodeGenerator::createInValuesBitmap( ? Data_Namespace::GPU_LEVEL : Data_Namespace::CPU_LEVEL, executor()->deviceCount(co.device_type), - executor()->data_mgr_); + executor()->data_mgr_, + co); } catch (...) 
{ return nullptr; } diff --git a/QueryEngine/InputMetadata.cpp b/QueryEngine/InputMetadata.cpp index 7269b7ecc2..e43555e3d0 100644 --- a/QueryEngine/InputMetadata.cpp +++ b/QueryEngine/InputMetadata.cpp @@ -358,6 +358,13 @@ ChunkMetadataMap synthesize_metadata_table_function(const ResultSet* rows) { return chunk_metadata_map; } +namespace { +union Number64 { + double as_double; + int64_t as_int64; +}; +} // namespace + ChunkMetadataMap synthesize_metadata(const ResultSet* rows) { auto timer = DEBUG_TIMER(__func__); ChunkMetadataMap metadata_map; @@ -394,40 +401,54 @@ ChunkMetadataMap synthesize_metadata(const ResultSet* rows) { } rows->moveToBegin(); + std::vector row_col_ti; + std::vector col_null_vals(rows->colCount()); + for (size_t i = 0; i < rows->colCount(); i++) { + auto const col_ti = rows->getColType(i); + row_col_ti.push_back(col_ti); + if (uses_int_meta(col_ti)) { + col_null_vals[i].as_int64 = inline_int_null_val(col_ti); + } else if (col_ti.is_fp()) { + col_null_vals[i].as_double = inline_fp_null_val(col_ti); + } else { + throw std::runtime_error(col_ti.get_type_name() + + " is not supported in temporary table."); + } + } + // Code in the do_work lambda runs for and processes each row. 
- const auto do_work = [rows](const std::vector& crt_row, - std::vector>& dummy_encoders) { + const auto do_work = [rows, &row_col_ti, &col_null_vals]( + const std::vector& crt_row, + std::vector>& dummy_encoders) { for (size_t i = 0; i < rows->colCount(); ++i) { - const auto& col_ti = rows->getColType(i); + const auto& col_ti = row_col_ti[i]; const auto& col_val = crt_row[i]; const auto scalar_col_val = boost::get(&col_val); CHECK(scalar_col_val); if (uses_int_meta(col_ti)) { const auto i64_p = boost::get(scalar_col_val); CHECK(i64_p); - dummy_encoders[i]->updateStats(*i64_p, *i64_p == inline_int_null_val(col_ti)); - } else if (col_ti.is_fp()) { + dummy_encoders[i]->updateStats(*i64_p, *i64_p == col_null_vals[i].as_int64); + } else { + CHECK(col_ti.is_fp()); switch (col_ti.get_type()) { case kFLOAT: { const auto float_p = boost::get(scalar_col_val); CHECK(float_p); dummy_encoders[i]->updateStats(*float_p, - *float_p == inline_fp_null_val(col_ti)); + *float_p == col_null_vals[i].as_double); break; } case kDOUBLE: { const auto double_p = boost::get(scalar_col_val); CHECK(double_p); dummy_encoders[i]->updateStats(*double_p, - *double_p == inline_fp_null_val(col_ti)); + *double_p == col_null_vals[i].as_double); break; } default: CHECK(false); } - } else { - throw std::runtime_error(col_ti.get_type_name() + - " is not supported in temporary table."); } } }; diff --git a/QueryEngine/JoinHashTable/BoundingBoxIntersectJoinHashTable.cpp b/QueryEngine/JoinHashTable/BoundingBoxIntersectJoinHashTable.cpp index 6261077e42..ec1f3c6138 100644 --- a/QueryEngine/JoinHashTable/BoundingBoxIntersectJoinHashTable.cpp +++ b/QueryEngine/JoinHashTable/BoundingBoxIntersectJoinHashTable.cpp @@ -26,6 +26,7 @@ #include "QueryEngine/JoinHashTable/RangeJoinHashTable.h" #include "QueryEngine/JoinHashTable/Runtime/HashJoinKeyHandlers.h" #include "QueryEngine/JoinHashTable/Runtime/JoinHashTableGpuUtils.h" +#include "QueryEngine/enums.h" std::unique_ptr 
BoundingBoxIntersectJoinHashTable::hash_table_cache_ = std::make_unique(CacheItemType::BBOX_INTERSECT_HT, @@ -1669,11 +1670,7 @@ HashJoinMatchingSet BoundingBoxIntersectJoinHashTable::codegenMatchingSet( one_to_many_ptr = LL_BUILDER.CreateAdd(one_to_many_ptr, LL_INT(composite_key_dict_size)); - // NOTE(jclay): A fixed array of size 200 is allocated on the stack. - // this is likely the maximum value we can do that is safe to use across - // all supported GPU architectures. - const int max_array_size = 200; - const auto arr_type = get_int_array_type(32, max_array_size, LL_CONTEXT); + const auto arr_type = get_int_array_type(32, kMaxBBoxOverlapsCount, LL_CONTEXT); const auto out_arr_lv = LL_BUILDER.CreateAlloca(arr_type); out_arr_lv->setName("out_arr"); @@ -1685,27 +1682,32 @@ HashJoinMatchingSet BoundingBoxIntersectJoinHashTable::codegenMatchingSet( auto rowid_ptr_i32 = LL_BUILDER.CreatePointerCast(element_ptr, llvm::Type::getInt32PtrTy(LL_CONTEXT)); + const auto error_code_ptr = LL_BUILDER.CreateAlloca( + get_int_type(32, LL_CONTEXT), nullptr, "candidate_rows_error_code"); + LL_BUILDER.CreateStore(LL_INT(int32_t(0)), error_code_ptr); + const auto candidate_count_lv = executor_->cgen_state_->emitExternalCall( "get_candidate_rows", llvm::Type::getInt64Ty(LL_CONTEXT), - { - rowid_ptr_i32, - LL_INT(max_array_size), - many_to_many_args[1], - LL_INT(0), - LL_FP(inverse_bucket_sizes_for_dimension_[0]), - LL_FP(inverse_bucket_sizes_for_dimension_[1]), - many_to_many_args[0], - LL_INT(key_component_count), // key_component_count - composite_key_dict, // ptr to hash table - LL_INT(getEntryCount()), // entry_count - LL_INT(composite_key_dict_size), // offset_buffer_ptr_offset - LL_INT(getEntryCount() * sizeof(int32_t)) // sub_buff_size - }); + {rowid_ptr_i32, + error_code_ptr, + LL_INT(kMaxBBoxOverlapsCount), + many_to_many_args[1], + LL_INT(0), + LL_FP(inverse_bucket_sizes_for_dimension_[0]), + LL_FP(inverse_bucket_sizes_for_dimension_[1]), + many_to_many_args[0], + 
LL_INT(key_component_count), // key_component_count + composite_key_dict, // ptr to hash table + LL_INT(getEntryCount()), // entry_count + LL_INT(composite_key_dict_size), // offset_buffer_ptr_offset + LL_INT(getEntryCount() * sizeof(int32_t)), // sub_buff_size + LL_INT(int32_t(heavyai::ErrorCode::BBOX_OVERLAPS_LIMIT_EXCEEDED))}); const auto slot_lv = LL_INT(int64_t(0)); - - return {rowid_ptr_i32, candidate_count_lv, slot_lv}; + auto error_code_lv = LL_BUILDER.CreateLoad( + error_code_ptr->getType()->getPointerElementType(), error_code_ptr); + return {rowid_ptr_i32, candidate_count_lv, slot_lv, error_code_lv}; } else { VLOG(1) << "Building codegenMatchingSet for Baseline"; // TODO: duplicated w/ BaselineJoinHashTable -- push into the hash table builder? diff --git a/QueryEngine/JoinHashTable/BoundingBoxIntersectJoinHashTable.h b/QueryEngine/JoinHashTable/BoundingBoxIntersectJoinHashTable.h index c91c33da5c..152d871e23 100644 --- a/QueryEngine/JoinHashTable/BoundingBoxIntersectJoinHashTable.h +++ b/QueryEngine/JoinHashTable/BoundingBoxIntersectJoinHashTable.h @@ -21,6 +21,11 @@ #include "QueryEngine/JoinHashTable/BaselineJoinHashTable.h" #include "QueryEngine/JoinHashTable/HashJoin.h" +// NOTE(jclay): A fixed array of size 200 is allocated on the stack. +// this is likely the maximum value we can do that is safe to use across +// all supported GPU architectures. 
+constexpr int32_t kMaxBBoxOverlapsCount{200}; + class BoundingBoxIntersectJoinHashTable : public HashJoin { public: BoundingBoxIntersectJoinHashTable(const std::shared_ptr condition, diff --git a/QueryEngine/JoinHashTable/HashJoin.cpp b/QueryEngine/JoinHashTable/HashJoin.cpp index 1e756dd0c6..f7cd18d51d 100644 --- a/QueryEngine/JoinHashTable/HashJoin.cpp +++ b/QueryEngine/JoinHashTable/HashJoin.cpp @@ -253,7 +253,7 @@ HashJoinMatchingSet HashJoin::codegenMatchingSet( rowid_base_i32->getType()->getScalarType()->getPointerElementType(), rowid_base_i32, slot_lv); - return {rowid_ptr_i32, row_count_lv, slot_lv}; + return {rowid_ptr_i32, row_count_lv, slot_lv, nullptr}; } llvm::Value* HashJoin::codegenHashTableLoad(const size_t table_idx, Executor* executor) { diff --git a/QueryEngine/JoinHashTable/HashJoin.h b/QueryEngine/JoinHashTable/HashJoin.h index 52bee7639f..3ac95b3e6a 100644 --- a/QueryEngine/JoinHashTable/HashJoin.h +++ b/QueryEngine/JoinHashTable/HashJoin.h @@ -121,6 +121,7 @@ struct HashJoinMatchingSet { llvm::Value* elements; llvm::Value* count; llvm::Value* slot; + llvm::Value* error_code; }; struct CompositeKeyInfo { diff --git a/QueryEngine/JoinHashTable/Runtime/JoinHashTableQueryRuntime.cpp b/QueryEngine/JoinHashTable/Runtime/JoinHashTableQueryRuntime.cpp index 5710f1e91c..4cf69151dd 100644 --- a/QueryEngine/JoinHashTable/Runtime/JoinHashTableQueryRuntime.cpp +++ b/QueryEngine/JoinHashTable/Runtime/JoinHashTableQueryRuntime.cpp @@ -288,6 +288,7 @@ struct Bounds { /// The number of row ids in this array is returned. 
extern "C" RUNTIME_EXPORT NEVER_INLINE DEVICE int64_t get_candidate_rows(int32_t* out_arr, + int32_t* error_code, const uint32_t max_arr_size, const int8_t* range_bytes, const int32_t range_component_index, @@ -298,7 +299,8 @@ get_candidate_rows(int32_t* out_arr, int64_t* hash_table_ptr, const int64_t entry_count, const int64_t offset_buffer_ptr_offset, - const int64_t sub_buff_size) { + const int64_t sub_buff_size, + const int32_t max_bbox_overlaps_error_code) { const auto range = reinterpret_cast(range_bytes); size_t elem_count = 0; @@ -325,7 +327,10 @@ get_candidate_rows(int32_t* out_arr, for (int64_t j = 0; j < buffer_range.element_count; j++) { const auto rowid = buffer_range.buffer[j]; elem_count += insert_sorted(out_arr, elem_count, rowid); - assert(max_arr_size >= elem_count); + if (elem_count > max_arr_size) { + *error_code = max_bbox_overlaps_error_code; + return 0; + } } } } diff --git a/QueryEngine/LogicalIR.cpp b/QueryEngine/LogicalIR.cpp index 0ccca3ea0a..78ca4932cf 100644 --- a/QueryEngine/LogicalIR.cpp +++ b/QueryEngine/LogicalIR.cpp @@ -16,6 +16,7 @@ #include "CodeGenerator.h" #include "Execute.h" +#include "GeoOperators/Codegen.h" #include "NullableValue.h" #include @@ -395,7 +396,10 @@ llvm::Value* CodeGenerator::codegenIsNull(const Analyzer::UOper* uoper, } llvm::Value* operand_lv = codegen(operand, true, co).front(); // NULL-check array or geo's coords array - if (ti.is_array() || ti.is_geometry()) { + if (ti.get_type() == kPOINT && dynamic_cast(operand)) { + char const* const fname = spatial_type::Codegen::pointIsNullFunctionName(ti); + return cgen_state_->emitCall(fname, {operand_lv}); + } else if (ti.is_array() || ti.is_geometry()) { // POINT [un]compressed coord check requires custom checker and chunk iterator // Non-POINT NULL geographies will have a normally encoded null coord array auto fname = diff --git a/QueryEngine/LoopControlFlow/JoinLoop.cpp b/QueryEngine/LoopControlFlow/JoinLoop.cpp index 70d1d2e4a8..cb2e874fcc 100644 --- 
a/QueryEngine/LoopControlFlow/JoinLoop.cpp +++ b/QueryEngine/LoopControlFlow/JoinLoop.cpp @@ -106,9 +106,25 @@ llvm::BasicBlock* JoinLoop::codegen( } builder.CreateStore(ll_int(int64_t(0), context), iteration_counter_ptr); const auto iteration_domain = join_loop.iteration_domain_codegen_(iterators); + const auto head_bb = llvm::BasicBlock::Create( context, "ub_iter_head_" + join_loop.name_, parent_func); - builder.CreateBr(head_bb); + + if (iteration_domain.error_code) { + cgen_state->needs_error_check_ = true; + auto ub_iter_success_code = ll_int(int32_t(0), context); + const auto ub_iter_error_condition = + builder.CreateICmpEQ(iteration_domain.error_code, ub_iter_success_code); + auto error_bb = + llvm::BasicBlock::Create(context, "ub_iter_error_exit", parent_func); + builder.CreateCondBr(ub_iter_error_condition, head_bb, error_bb); + + builder.SetInsertPoint(error_bb); + builder.CreateRet(iteration_domain.error_code); + } else { + builder.CreateBr(head_bb); + } + builder.SetInsertPoint(head_bb); llvm::Value* iteration_counter = builder.CreateLoad(iteration_counter_ptr->getType()->getPointerElementType(), diff --git a/QueryEngine/LoopControlFlow/JoinLoop.h b/QueryEngine/LoopControlFlow/JoinLoop.h index aa2697fcbb..805bb97c59 100644 --- a/QueryEngine/LoopControlFlow/JoinLoop.h +++ b/QueryEngine/LoopControlFlow/JoinLoop.h @@ -47,6 +47,7 @@ struct JoinLoopDomain { llvm::Value* slot_lookup_result; // for Singleton }; llvm::Value* values_buffer; // used for Set + llvm::Value* error_code; }; // Any join is logically a loop. 
Hash joins just limit the domain of iteration, diff --git a/QueryEngine/MLPredictCodegen.cpp b/QueryEngine/MLPredictCodegen.cpp index 88027f6d4e..d8ba6b39f7 100644 --- a/QueryEngine/MLPredictCodegen.cpp +++ b/QueryEngine/MLPredictCodegen.cpp @@ -118,7 +118,6 @@ std::vector> generated_encoded_and_casted_featur llvm::Value* CodeGenerator::codegenLinRegPredict( const Analyzer::MLPredictExpr* expr, - const std::string& model_name, const std::shared_ptr& abstract_model, const CompilationOptions& co) { AUTOMATIC_IR_METADATA(cgen_state_); @@ -181,15 +180,9 @@ llvm::Value* CodeGenerator::codegenLinRegPredict( llvm::Value* CodeGenerator::codegenTreeRegPredict( const Analyzer::MLPredictExpr* expr, - const std::string& model_name, - const std::shared_ptr& model, + const std::shared_ptr& tree_model, const CompilationOptions& co) { #ifdef HAVE_ONEDAL - const auto tree_model = std::dynamic_pointer_cast(model); - // The parent codegen function called this function `codegenTreeRegPredict` - // iff we a tree reg MLModelType, so below is just a sanity - // check - CHECK(tree_model); const int64_t num_trees = static_cast(tree_model->getNumTrees()); const auto& regressor_exprs = expr->get_regressor_values(); const auto& cat_feature_keys = tree_model->getCatFeatureKeys(); @@ -318,12 +311,19 @@ llvm::Value* CodeGenerator::codegen(const Analyzer::MLPredictExpr* expr, switch (model_type) { case MLModelType::LINEAR_REG: { - return codegenLinRegPredict(expr, model_name, abstract_model, co); + return codegenLinRegPredict(expr, abstract_model, co); } case MLModelType::DECISION_TREE_REG: case MLModelType::GBT_REG: case MLModelType::RANDOM_FOREST_REG: { - return codegenTreeRegPredict(expr, model_name, abstract_model, co); + if (auto tree_model = + std::dynamic_pointer_cast(abstract_model)) { + return codegenTreeRegPredict(expr, tree_model, co); + } else { + throw std::runtime_error( + "Invalid ML model codegen call. 
Input model is not of expected type " + "TreeModel."); + } } default: { throw std::runtime_error("Unsupported model type."); diff --git a/QueryEngine/NativeCodegen.cpp b/QueryEngine/NativeCodegen.cpp index 2b37f573d8..41cc84f953 100644 --- a/QueryEngine/NativeCodegen.cpp +++ b/QueryEngine/NativeCodegen.cpp @@ -78,6 +78,8 @@ static_assert(false, "LLVM Version >= 9 is required."); #include "Shared/MathUtils.h" #include "StreamingTopN.h" +using heavyai::ErrorCode; + float g_fraction_code_cache_to_evict = 0.2; #ifdef ENABLE_GEOS @@ -670,7 +672,7 @@ declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double); -declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64); +declare i64 @get_candidate_rows(i32*, i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64, i32); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); @@ -838,10 +840,10 @@ declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8); -declare i1 @string_like_simple(i8*, i32, i8*, i32); -declare i1 @string_ilike_simple(i8*, i32, i8*, i32); -declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); -declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); +declare i1 @string_like_simple(i8*, i32, i8*, i32, i8); +declare i1 @string_ilike_simple(i8*, i32, i8*, i32, i8); +declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8, i8); +declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, 
i32); declare i1 @string_gt(i8*, i32, i8*, i32); @@ -1367,29 +1369,23 @@ std::shared_ptr CodeGenerator::generateNativeGPUCode( } LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX"; - auto cubin_result = ptx_to_cubin(ptx, gpu_target.cuda_mgr); - auto& option_keys = cubin_result.option_keys; - auto& option_values = cubin_result.option_values; - auto cubin = cubin_result.cubin; - auto link_state = cubin_result.link_state; - const auto num_options = option_keys.size(); - + CubinResult cubin_result = ptx_to_cubin(ptx, gpu_target.cuda_mgr); auto func_name = wrapper_func->getName().str(); auto gpu_compilation_context = std::make_shared(); for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount(); ++device_id) { gpu_compilation_context->addDeviceCode( - std::make_unique(cubin, + std::make_unique(cubin_result.cubin, cubin_result.cubin_size, func_name, device_id, gpu_target.cuda_mgr, - num_options, - &option_keys[0], - &option_values[0])); + cubin_result.option_keys.size(), + cubin_result.option_keys.data(), + cubin_result.option_values.data())); } - checkCudaErrors(cuLinkDestroy(link_state)); + checkCudaErrors(cuLinkDestroy(cubin_result.link_state)); return gpu_compilation_context; #else return {}; @@ -2129,7 +2125,9 @@ void Executor::createErrorCheckControlFlow( auto detected_timeout = watchdog_ir_builder.CreateCall( cgen_state_->module_->getFunction("dynamic_watchdog"), {}); auto timeout_err_lv = watchdog_ir_builder.CreateSelect( - detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv); + detected_timeout, + cgen_state_->llInt(int32_t(ErrorCode::OUT_OF_TIME)), + err_lv); watchdog_ir_builder.CreateBr(error_check_bb); llvm::ReplaceInstWithInst( @@ -2166,7 +2164,7 @@ void Executor::createErrorCheckControlFlow( cgen_state_->module_->getFunction("check_interrupt"), {}); interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect( detected_interrupt, - cgen_state_->llInt(Executor::ERR_INTERRUPTED), + 
cgen_state_->llInt(int32_t(ErrorCode::INTERRUPTED)), err_lv); interrupt_checker_ir_builder.CreateBr(error_check_bb); }; @@ -2274,9 +2272,10 @@ void Executor::createErrorCheckControlFlow( // let kernel execution finish as expected, regardless of the observed error, // unless it is from the dynamic watchdog where all threads within that block // return together. - err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, - err_lv, - cgen_state_->llInt(Executor::ERR_OUT_OF_TIME)); + err_lv = + ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ, + err_lv, + cgen_state_->llInt(int32_t(ErrorCode::OUT_OF_TIME))); } else { err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE, err_lv, @@ -3279,7 +3278,7 @@ void Executor::insertErrorCodeChecker(llvm::Function* query_func, std::vector{error_code_arg}); err_code = interrupt_checker_ir_builder.CreateSelect( detected_interrupt, - cgen_state_->llInt(Executor::ERR_INTERRUPTED), + cgen_state_->llInt(int32_t(ErrorCode::INTERRUPTED)), detected_error); interrupt_checker_ir_builder.CreateBr(error_check_bb); llvm::ReplaceInstWithInst(&check_interrupt_br_instr, diff --git a/QueryEngine/NvidiaKernel.cpp b/QueryEngine/NvidiaKernel.cpp index 3983e03eb9..fc993bd88f 100644 --- a/QueryEngine/NvidiaKernel.cpp +++ b/QueryEngine/NvidiaKernel.cpp @@ -14,43 +14,48 @@ * limitations under the License. 
*/ -#include - #include "NvidiaKernel.h" - #include "Logger/Logger.h" #include "OSDependent/heavyai_path.h" #include +#include + #ifdef HAVE_CUDA -namespace { -#define JIT_LOG_SIZE 8192 - -void fill_options(std::vector& option_keys, - std::vector& option_values, - char* info_log, - char* error_log) { - option_keys.push_back(CU_JIT_LOG_VERBOSE); - option_values.push_back(reinterpret_cast(1)); - option_keys.push_back(CU_JIT_THREADS_PER_BLOCK); - // fix the minimum # threads per block to the hardware-limit maximum num threads - // to avoid recompiling jit module even if we manipulate it via query hint - // (and allowed `CU_JIT_THREADS_PER_BLOCK` range is between 1 and 1024 by query hint) - option_values.push_back(reinterpret_cast(1024)); - option_keys.push_back(CU_JIT_WALL_TIME); - option_values.push_back(reinterpret_cast(0)); - option_keys.push_back(CU_JIT_INFO_LOG_BUFFER); - option_values.push_back(reinterpret_cast(info_log)); - option_keys.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES); - option_values.push_back(reinterpret_cast((long)JIT_LOG_SIZE)); - option_keys.push_back(CU_JIT_ERROR_LOG_BUFFER); - option_values.push_back(reinterpret_cast(error_log)); - option_keys.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); - option_values.push_back(reinterpret_cast((long)JIT_LOG_SIZE)); +CubinResult::CubinResult() + : cubin(nullptr), link_state(CUlinkState{}), cubin_size(0u), jit_wall_time_idx(0u) { + constexpr size_t JIT_LOG_SIZE = 8192u; + static_assert(0u < JIT_LOG_SIZE); + info_log.resize(JIT_LOG_SIZE - 1u); // minus 1 for null terminator + error_log.resize(JIT_LOG_SIZE - 1u); + std::pair options[] = { + {CU_JIT_LOG_VERBOSE, reinterpret_cast(1)}, + // fix the minimum # threads per block to the hardware-limit maximum num threads to + // avoid recompiling jit module even if we manipulate it via query hint (and allowed + // `CU_JIT_THREADS_PER_BLOCK` range is between 1 and 1024 by query hint) + {CU_JIT_THREADS_PER_BLOCK, reinterpret_cast(1024)}, + {CU_JIT_WALL_TIME, 
nullptr}, // input not read, only output + {CU_JIT_INFO_LOG_BUFFER, reinterpret_cast(&info_log[0])}, + {CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, reinterpret_cast(JIT_LOG_SIZE)}, + {CU_JIT_ERROR_LOG_BUFFER, reinterpret_cast(&error_log[0])}, + {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, reinterpret_cast(JIT_LOG_SIZE)}}; + constexpr size_t n_options = sizeof(options) / sizeof(*options); + option_keys.reserve(n_options); + option_values.reserve(n_options); + for (size_t i = 0; i < n_options; ++i) { + option_keys.push_back(options[i].first); + option_values.push_back(options[i].second); + if (options[i].first == CU_JIT_WALL_TIME) { + jit_wall_time_idx = i; + } + } + CHECK_EQ(CU_JIT_WALL_TIME, option_keys[jit_wall_time_idx]) << jit_wall_time_idx; } +namespace { + boost::filesystem::path get_gpu_rt_path() { boost::filesystem::path gpu_rt_path{heavyai::get_root_abs_path()}; gpu_rt_path /= "QueryEngine"; @@ -77,38 +82,38 @@ boost::filesystem::path get_cuda_table_functions_path() { } // namespace void nvidia_jit_warmup() { - std::vector option_keys; - std::vector option_values; - char info_log[JIT_LOG_SIZE]; - char error_log[JIT_LOG_SIZE]; - fill_options(option_keys, option_values, info_log, error_log); - CHECK_EQ(option_values.size(), option_keys.size()); - unsigned num_options = option_keys.size(); - CUlinkState link_state; - checkCudaErrors( - cuLinkCreate(num_options, &option_keys[0], &option_values[0], &link_state)) - << ": " << std::string(error_log); - VLOG(1) << "CUDA JIT time to create link: " - << *reinterpret_cast(&option_values[2]); + CubinResult cubin_result{}; + CHECK_EQ(cubin_result.option_values.size(), cubin_result.option_keys.size()); + unsigned const num_options = cubin_result.option_keys.size(); + checkCudaErrors(cuLinkCreate(num_options, + cubin_result.option_keys.data(), + cubin_result.option_values.data(), + &cubin_result.link_state)) + << ": " << cubin_result.error_log.c_str(); + VLOG(1) << "CUDA JIT time to create link: " << cubin_result.jitWallTime(); 
boost::filesystem::path gpu_rt_path = get_gpu_rt_path(); boost::filesystem::path cuda_table_functions_path = get_cuda_table_functions_path(); CHECK(!gpu_rt_path.empty()); CHECK(!cuda_table_functions_path.empty()); - checkCudaErrors(cuLinkAddFile( - link_state, CU_JIT_INPUT_FATBINARY, gpu_rt_path.c_str(), 0, nullptr, nullptr)) - << ": " << std::string(error_log); - VLOG(1) << "CUDA JIT time to add RT fatbinary: " - << *reinterpret_cast(&option_values[2]); - checkCudaErrors(cuLinkAddFile(link_state, + checkCudaErrors(cuLinkAddFile(cubin_result.link_state, + CU_JIT_INPUT_FATBINARY, + gpu_rt_path.c_str(), + 0, + nullptr, + nullptr)) + << ": " << cubin_result.error_log.c_str(); + VLOG(1) << "CUDA JIT time to add RT fatbinary: " << cubin_result.jitWallTime(); + checkCudaErrors(cuLinkAddFile(cubin_result.link_state, CU_JIT_INPUT_LIBRARY, cuda_table_functions_path.c_str(), 0, nullptr, nullptr)) - << ": " << std::string(error_log); + << ": " << cubin_result.error_log.c_str(); VLOG(1) << "CUDA JIT time to add GPU table functions library: " - << *reinterpret_cast(&option_values[2]); - checkCudaErrors(cuLinkDestroy(link_state)) << ": " << std::string(error_log); + << cubin_result.jitWallTime(); + checkCudaErrors(cuLinkDestroy(cubin_result.link_state)) + << ": " << cubin_result.error_log.c_str(); } std::string add_line_numbers(const std::string& text) { @@ -130,19 +135,14 @@ CubinResult ptx_to_cubin(const std::string& ptx, CHECK(!ptx.empty()); CHECK(cuda_mgr && cuda_mgr->getDeviceCount() > 0); cuda_mgr->setContext(0); - std::vector option_keys; - std::vector option_values; - char info_log[JIT_LOG_SIZE]; - char error_log[JIT_LOG_SIZE]; - fill_options(option_keys, option_values, info_log, error_log); - CHECK_EQ(option_values.size(), option_keys.size()); - unsigned num_options = option_keys.size(); - CUlinkState link_state; - checkCudaErrors( - cuLinkCreate(num_options, &option_keys[0], &option_values[0], &link_state)) - << ": " << std::string(error_log); - VLOG(1) << "CUDA JIT 
time to create link: " - << *reinterpret_cast(&option_values[2]); + CubinResult cubin_result{}; + CHECK_EQ(cubin_result.option_values.size(), cubin_result.option_keys.size()); + checkCudaErrors(cuLinkCreate(cubin_result.option_keys.size(), + cubin_result.option_keys.data(), + cubin_result.option_values.data(), + &cubin_result.link_state)) + << ": " << cubin_result.error_log.c_str(); + VLOG(1) << "CUDA JIT time to create link: " << cubin_result.jitWallTime(); boost::filesystem::path gpu_rt_path = get_gpu_rt_path(); boost::filesystem::path cuda_table_functions_path = get_cuda_table_functions_path(); @@ -152,21 +152,26 @@ CubinResult ptx_to_cubin(const std::string& ptx, // 1. nvcc -std=c++11 -arch=sm_35 --device-link -c [list of .cu files] // 2. nvcc -std=c++11 -arch=sm_35 -lib [list of .o files generated by step 1] -o // [library_name.a] - checkCudaErrors(cuLinkAddFile( - link_state, CU_JIT_INPUT_FATBINARY, gpu_rt_path.c_str(), 0, nullptr, nullptr)) - << ": " << std::string(error_log); - VLOG(1) << "CUDA JIT time to add RT fatbinary: " - << *reinterpret_cast(&option_values[2]); - checkCudaErrors(cuLinkAddFile(link_state, + checkCudaErrors(cuLinkAddFile(cubin_result.link_state, + CU_JIT_INPUT_FATBINARY, + gpu_rt_path.c_str(), + 0, + nullptr, + nullptr)) + << ": " << cubin_result.error_log.c_str(); + VLOG(1) << "CUDA JIT time to add RT fatbinary: " << cubin_result.jitWallTime(); + checkCudaErrors(cuLinkAddFile(cubin_result.link_state, CU_JIT_INPUT_LIBRARY, cuda_table_functions_path.c_str(), 0, nullptr, nullptr)) - << ": " << std::string(error_log); + << ": " << cubin_result.error_log.c_str(); VLOG(1) << "CUDA JIT time to add GPU table functions library: " - << *reinterpret_cast(&option_values[2]); - checkCudaErrors(cuLinkAddData(link_state, + << cubin_result.jitWallTime(); + // The ptx.length() + 1 follows the example in + // https://developer.nvidia.com/blog/discovering-new-features-in-cuda-11-4/ + checkCudaErrors(cuLinkAddData(cubin_result.link_state, 
CU_JIT_INPUT_PTX, static_cast(const_cast(ptx.c_str())), ptx.length() + 1, @@ -174,23 +179,19 @@ CubinResult ptx_to_cubin(const std::string& ptx, 0, nullptr, nullptr)) - << ": " << std::string(error_log) << "\nPTX:\n" + << ": " << cubin_result.error_log.c_str() << "\nPTX:\n" << add_line_numbers(ptx) << "\nEOF PTX"; - VLOG(1) << "CUDA JIT time to add generated code: " - << *reinterpret_cast(&option_values[2]); - void* cubin{nullptr}; - size_t cubinSize{0}; - checkCudaErrors(cuLinkComplete(link_state, &cubin, &cubinSize)) - << ": " << std::string(error_log); - VLOG(1) << "CUDA Linker completed: " << info_log; - CHECK(cubin); - CHECK_GT(cubinSize, size_t(0)); - VLOG(1) << "Generated GPU binary code size: " << cubinSize << " bytes"; - return {cubin, option_keys, option_values, link_state, cubinSize}; + VLOG(1) << "CUDA JIT time to add generated code: " << cubin_result.jitWallTime(); + checkCudaErrors(cuLinkComplete( + cubin_result.link_state, &cubin_result.cubin, &cubin_result.cubin_size)) + << ": " << cubin_result.error_log.c_str(); + VLOG(1) << "CUDA Linker completed: " << cubin_result.info_log.c_str(); + CHECK(cubin_result.cubin); + CHECK_LT(0u, cubin_result.cubin_size); + VLOG(1) << "Generated GPU binary code size: " << cubin_result.cubin_size << " bytes"; + return cubin_result; } -#endif -#ifdef HAVE_CUDA GpuDeviceCompilationContext::GpuDeviceCompilationContext(const void* image, const size_t module_size, const std::string& kernel_name, diff --git a/QueryEngine/NvidiaKernel.h b/QueryEngine/NvidiaKernel.h index 3eb68b9639..f47bac2dd7 100644 --- a/QueryEngine/NvidiaKernel.h +++ b/QueryEngine/NvidiaKernel.h @@ -33,6 +33,15 @@ struct CubinResult { std::vector option_values; CUlinkState link_state; size_t cubin_size; + + std::string info_log; + std::string error_log; + size_t jit_wall_time_idx; + + CubinResult(); + inline float jitWallTime() const { + return *reinterpret_cast(&option_values[jit_wall_time_idx]); + } }; /** diff --git a/QueryEngine/QueryHint.h 
b/QueryEngine/QueryHint.h index 9a305c27a6..0e9747cda7 100644 --- a/QueryEngine/QueryHint.h +++ b/QueryEngine/QueryHint.h @@ -54,6 +54,8 @@ enum QueryHint { kforceOneToManyHashJoin, kWatchdogMaxProjectedRowsPerDevice, kPreflightCountQueryThreshold, + kTableReorderingOff, + kNDVGroupsEstimatorMultiplier, kHintCount, // should be at the last elem before INVALID enum value to count # // supported hints correctly kInvalidHint // this should be the last elem of this enum @@ -87,7 +89,9 @@ static const std::unordered_map SupportedQueryHints = { {"force_one_to_many_hash_join", QueryHint::kforceOneToManyHashJoin}, {"watchdog_max_projected_rows_per_device", QueryHint::kWatchdogMaxProjectedRowsPerDevice}, - {"preflight_count_query_threshold", QueryHint::kPreflightCountQueryThreshold}}; + {"preflight_count_query_threshold", QueryHint::kPreflightCountQueryThreshold}, + {"table_reordering_off", QueryHint::kTableReorderingOff}, + {"ndv_groups_estimator_multiplier", QueryHint::kNDVGroupsEstimatorMultiplier}}; struct HintIdentifier { bool global_hint; @@ -202,6 +206,8 @@ struct RegisteredQueryHint { , query_time_limit(0) , watchdog_max_projected_rows_per_device(g_watchdog_max_projected_rows_per_device) , preflight_count_query_threshold(g_preflight_count_query_threshold) + , table_reordering_off(false) + , ndv_groups_estimator_multiplier(2.0) , cuda_block_size(0) , cuda_grid_size_multiplier(0.0) , opt_cuda_grid_and_block_size(false) @@ -316,6 +322,13 @@ struct RegisteredQueryHint { updated_query_hints.preflight_count_query_threshold = global_hints.preflight_count_query_threshold; break; + case QueryHint::kTableReorderingOff: + updated_query_hints.table_reordering_off = global_hints.table_reordering_off; + break; + case QueryHint::kNDVGroupsEstimatorMultiplier: + updated_query_hints.ndv_groups_estimator_multiplier = + global_hints.ndv_groups_estimator_multiplier; + break; default: UNREACHABLE(); } @@ -335,6 +348,8 @@ struct RegisteredQueryHint { size_t query_time_limit; size_t 
watchdog_max_projected_rows_per_device; size_t preflight_count_query_threshold; + bool table_reordering_off; + double ndv_groups_estimator_multiplier; // control CUDA behavior size_t cuda_block_size; diff --git a/QueryEngine/QueryMemoryInitializer.cpp b/QueryEngine/QueryMemoryInitializer.cpp index 62272cdf81..ce265867b4 100644 --- a/QueryEngine/QueryMemoryInitializer.cpp +++ b/QueryEngine/QueryMemoryInitializer.cpp @@ -29,6 +29,14 @@ int64_t g_bitmap_memory_limit{8LL * 1000 * 1000 * 1000}; namespace { +struct AddNbytes { + size_t const entry_count; + size_t operator()(size_t const sum, ApproxQuantileDescriptor const aqd) const { + return sum + + entry_count * quantile::TDigest::nbytes(aqd.buffer_size, aqd.centroids_size); + } +}; + inline void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) { const size_t groups_buffer_entry_count = query_mem_desc.getEntryCount(); checked_int64_t total_bytes_per_group = 0; @@ -262,6 +270,28 @@ QueryMemoryInitializer::QueryMemoryInitializer( if (device_type == ExecutorDeviceType::GPU) { allocateCountDistinctGpuMem(query_mem_desc); } + agg_op_metadata.count_distinct_buf_size = + calculateCountDistinctBufferSize(query_mem_desc, ra_exe_unit); + size_t total_buffer_size{0}; + for (auto buffer_size : agg_op_metadata.count_distinct_buf_size) { + if (buffer_size > 0) { + total_buffer_size += buffer_size; + } + } + total_buffer_size *= query_mem_desc.getEntryCount(); + row_set_mem_owner_->initCountDistinctBufferAllocator(total_buffer_size, thread_idx_); + } + + if (agg_op_metadata.has_tdigest) { + auto const& descs = query_mem_desc.getApproxQuantileDescriptors(); + // Pre-allocate all TDigest memory for this thread. 
+ AddNbytes const add_nbytes{query_mem_desc.getEntryCount()}; + size_t const capacity = + std::accumulate(descs.begin(), descs.end(), size_t(0), add_nbytes); + VLOG(2) << "row_set_mem_owner_->reserveTDigestMemory(" << thread_idx_ << ',' + << capacity << ") query_mem_desc.getEntryCount()(" + << query_mem_desc.getEntryCount() << ')'; + row_set_mem_owner_->reserveTDigestMemory(thread_idx_, capacity); } if (render_allocator_map || !query_mem_desc.isGroupBy()) { @@ -280,16 +310,12 @@ QueryMemoryInitializer::QueryMemoryInitializer( } if (query_mem_desc.isGroupBy()) { - if (agg_op_metadata.has_count_distinct) { - agg_op_metadata.count_distinct_buf_size = - calculateCountDistinctBufferSize(query_mem_desc, ra_exe_unit); - } if (agg_op_metadata.has_mode) { agg_op_metadata.mode_index_set = initializeModeIndexSet(query_mem_desc, ra_exe_unit); } if (agg_op_metadata.has_tdigest) { - agg_op_metadata.qualtile_params = + agg_op_metadata.quantile_params = initializeQuantileParams(query_mem_desc, ra_exe_unit); } } @@ -320,6 +346,7 @@ QueryMemoryInitializer::QueryMemoryInitializer( const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 
1 : num_buffers_; int64_t* group_by_buffer_template{nullptr}; + if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) { group_by_buffer_template = reinterpret_cast( row_set_mem_owner_->allocate(group_buffer_size, thread_idx_)); @@ -593,9 +620,9 @@ void QueryMemoryInitializer::initRowGroups(const QueryMemoryDescriptor& query_me auto buffer_ptr = reinterpret_cast(groups_buffer); const auto query_mem_desc_fixedup = ResultSet::fixupQueryMemoryDescriptor(query_mem_desc); + auto const key_sz = query_mem_desc.getEffectiveKeyWidth(); // not COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_QUANTILE // we use the default implementation in those agg ops - auto const key_sz = query_mem_desc.getEffectiveKeyWidth(); if (!(agg_op_metadata.has_count_distinct || agg_op_metadata.has_mode || agg_op_metadata.has_tdigest) && g_optimize_row_initialization) { @@ -747,27 +774,29 @@ void QueryMemoryInitializer::initColumnsPerRow( const TargetAggOpsMetadata& agg_op_metadata) { int8_t* col_ptr = row_ptr; size_t init_vec_idx = 0; + size_t approx_quantile_descriptors_idx = 0; for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount(); col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) { int64_t init_val{0}; if (query_mem_desc.isGroupBy()) { - if (agg_op_metadata.has_count_distinct) { + if (agg_op_metadata.has_count_distinct && + agg_op_metadata.count_distinct_buf_size[col_idx]) { // COUNT DISTINCT / APPROX_COUNT_DISTINCT // create a data structure for count_distinct operator per entries const int64_t bm_sz{agg_op_metadata.count_distinct_buf_size[col_idx]}; - if (bm_sz) { - CHECK_EQ(static_cast(query_mem_desc.getPaddedSlotWidthBytes(col_idx)), - sizeof(int64_t)); - init_val = - bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet(); - CHECK_NE(init_val, 0); - ++init_vec_idx; - } + CHECK_EQ(static_cast(query_mem_desc.getPaddedSlotWidthBytes(col_idx)), + sizeof(int64_t)); + init_val = + bm_sz > 0 ? 
allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet(); + CHECK_NE(init_val, 0); + ++init_vec_idx; } else if (agg_op_metadata.has_tdigest && - agg_op_metadata.qualtile_params[col_idx]) { - auto const q = *agg_op_metadata.qualtile_params[col_idx]; - // allocate for APPROX_QUANTILE only when slot is used - init_val = reinterpret_cast(row_set_mem_owner_->nullTDigest(q)); + agg_op_metadata.quantile_params[col_idx]) { + auto const q = *agg_op_metadata.quantile_params[col_idx]; + auto const& descs = query_mem_desc.getApproxQuantileDescriptors(); + auto const& desc = descs.at(approx_quantile_descriptors_idx++); + init_val = reinterpret_cast( + row_set_mem_owner_->initTDigest(thread_idx_, desc, q)); CHECK_NE(init_val, 0); ++init_vec_idx; } else if (agg_op_metadata.has_mode && @@ -899,39 +928,18 @@ int64_t QueryMemoryInitializer::allocateCountDistinctSet() { return reinterpret_cast(count_distinct_set); } -namespace { - -void eachAggregateTargetIdxOfType( - std::vector const& target_exprs, - SQLAgg const agg_type, - std::function lambda) { - for (size_t target_idx = 0; target_idx < target_exprs.size(); ++target_idx) { - auto const target_expr = target_exprs[target_idx]; - if (auto const* agg_expr = dynamic_cast(target_expr)) { - if (agg_expr->get_aggtype() == agg_type) { - lambda(agg_expr, target_idx); - } - } - } -} - -} // namespace - QueryMemoryInitializer::ModeIndexSet QueryMemoryInitializer::initializeModeIndexSet( const QueryMemoryDescriptor& query_mem_desc, const RelAlgExecutionUnit& ra_exe_unit) { size_t const slot_count = query_mem_desc.getSlotCount(); CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count); ModeIndexSet mode_index_set; - eachAggregateTargetIdxOfType( - ra_exe_unit.target_exprs, - kMODE, - [&](Analyzer::AggExpr const*, size_t const target_idx) { - size_t const agg_col_idx = - query_mem_desc.getSlotIndexForSingleSlotCol(target_idx); - CHECK_LT(agg_col_idx, slot_count); - mode_index_set.emplace(agg_col_idx); - }); + 
ra_exe_unit.eachAggTarget([&](Analyzer::AggExpr const*, + size_t const target_idx) { + size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx); + CHECK_LT(agg_col_idx, slot_count); + mode_index_set.emplace(agg_col_idx); + }); return mode_index_set; } @@ -940,16 +948,13 @@ void QueryMemoryInitializer::allocateModeBuffer( const RelAlgExecutionUnit& ra_exe_unit) { size_t const slot_count = query_mem_desc.getSlotCount(); CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count); - eachAggregateTargetIdxOfType( - ra_exe_unit.target_exprs, - kMODE, - [&](Analyzer::AggExpr const*, size_t const target_idx) { - size_t const agg_col_idx = - query_mem_desc.getSlotIndexForSingleSlotCol(target_idx); - CHECK_LT(agg_col_idx, slot_count); - AggMode* agg_mode = row_set_mem_owner_->allocateMode(); - init_agg_vals_[agg_col_idx] = reinterpret_cast(agg_mode); - }); + ra_exe_unit.eachAggTarget([&](Analyzer::AggExpr const*, + size_t const target_idx) { + size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx); + CHECK_LT(agg_col_idx, slot_count); + AggMode* agg_mode = row_set_mem_owner_->allocateMode(); + init_agg_vals_[agg_col_idx] = reinterpret_cast(agg_mode); + }); } std::vector @@ -959,20 +964,17 @@ QueryMemoryInitializer::initializeQuantileParams( size_t const slot_count = query_mem_desc.getSlotCount(); CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count); std::vector quantile_params(slot_count); - eachAggregateTargetIdxOfType( - ra_exe_unit.target_exprs, - kAPPROX_QUANTILE, - [&](Analyzer::AggExpr const* const agg_expr, size_t const target_idx) { - size_t const agg_col_idx = - query_mem_desc.getSlotIndexForSingleSlotCol(target_idx); - CHECK_LT(agg_col_idx, slot_count); - CHECK_EQ(static_cast(sizeof(int64_t)), - query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)); - auto const q_expr = - dynamic_cast(agg_expr->get_arg1().get()); - CHECK(q_expr); - quantile_params[agg_col_idx] = q_expr->get_constval().doubleval; - }); + 
ra_exe_unit.eachAggTarget([&](Analyzer::AggExpr const* const agg_expr, + size_t const target_idx) { + size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx); + CHECK_LT(agg_col_idx, slot_count); + CHECK_EQ(static_cast(sizeof(int64_t)), + query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)); + auto const q_expr = + dynamic_cast(agg_expr->get_arg1().get()); + CHECK(q_expr); + quantile_params[agg_col_idx] = q_expr->get_constval().doubleval; + }); return quantile_params; } @@ -981,23 +983,23 @@ void QueryMemoryInitializer::allocateTDigestsBuffer( const RelAlgExecutionUnit& ra_exe_unit) { size_t const slot_count = query_mem_desc.getSlotCount(); CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count); - eachAggregateTargetIdxOfType( - ra_exe_unit.target_exprs, - kAPPROX_QUANTILE, - [&](Analyzer::AggExpr const* const agg_expr, size_t const target_idx) { - size_t const agg_col_idx = - query_mem_desc.getSlotIndexForSingleSlotCol(target_idx); - CHECK_LT(agg_col_idx, slot_count); - CHECK_EQ(static_cast(sizeof(int64_t)), - query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)); - auto const q_expr = - dynamic_cast(agg_expr->get_arg1().get()); - CHECK(q_expr); - auto const q = q_expr->get_constval().doubleval; - // allocate for APPROX_QUANTILE only when slot is used - init_agg_vals_[agg_col_idx] = - reinterpret_cast(row_set_mem_owner_->nullTDigest(q)); - }); + + auto const& descs = query_mem_desc.getApproxQuantileDescriptors(); + size_t approx_quantile_descriptors_idx = 0u; + ra_exe_unit.eachAggTarget([&](Analyzer::AggExpr const* const agg_expr, + size_t const target_idx) { + size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx); + CHECK_LT(agg_col_idx, slot_count); + CHECK_EQ(static_cast(sizeof(int64_t)), + query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)); + auto const q_expr = + dynamic_cast(agg_expr->get_arg1().get()); + CHECK(q_expr); + auto const q = q_expr->get_constval().doubleval; + auto const& desc = 
descs.at(approx_quantile_descriptors_idx++); + init_agg_vals_[agg_col_idx] = + reinterpret_cast(row_set_mem_owner_->initTDigest(thread_idx_, desc, q)); + }); } GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer( diff --git a/QueryEngine/QueryMemoryInitializer.h b/QueryEngine/QueryMemoryInitializer.h index f193a58241..e4e300a887 100644 --- a/QueryEngine/QueryMemoryInitializer.h +++ b/QueryEngine/QueryMemoryInitializer.h @@ -42,7 +42,7 @@ class QueryMemoryInitializer { bool has_tdigest{false}; std::vector count_distinct_buf_size; ModeIndexSet mode_index_set; - std::vector qualtile_params; + std::vector quantile_params; }; // Row-based execution constructor diff --git a/QueryEngine/RelAlgDag.cpp b/QueryEngine/RelAlgDag.cpp index b9c5f918c0..a44dcff2eb 100644 --- a/QueryEngine/RelAlgDag.cpp +++ b/QueryEngine/RelAlgDag.cpp @@ -1394,8 +1394,11 @@ std::unique_ptr parse_scalar_expr(const rapidjson::Value& expr, } return std::unique_ptr(parse_operator(expr, root_dag)); } - throw QueryNotSupported("Expression node " + json_node_to_string(expr) + - " not supported"); + std::string const node_str = json_node_to_string(expr); + if (node_str.find("\"correl\":\"$cor") != std::string::npos) { + throw QueryNotSupported("Unable to decorrelate one of the correlated subqueries."); + } + throw QueryNotSupported("Expression node " + node_str + " not supported"); } JoinType to_join_type(const std::string& join_type_name) { @@ -2776,7 +2779,8 @@ void add_window_function_pre_project( window_func_project_node->replaceInput(prev_node, new_project); window_func_project_node->setExpressions(scalar_exprs_for_window_project); } else { - // only push rex_inputs listed in the window function down to a new project node + // try to push rex_inputs listed in the projection target exprs down to a new + // project node RexInputSet inputs; RexInputCollector input_collector; for (size_t i = 0; i < window_func_project_node->size(); i++) { @@ -2784,26 +2788,45 @@ void 
add_window_function_pre_project( input_collector.visit(window_func_project_node->getProjectAt(i)); inputs.insert(new_inputs.begin(), new_inputs.end()); } - - // Note: Technically not required since we are mapping old inputs to new input - // indices, but makes the re-mapping of inputs easier to follow. - std::vector sorted_inputs(inputs.begin(), inputs.end()); - std::sort(sorted_inputs.begin(), - sorted_inputs.end(), - [](const auto& a, const auto& b) { return a.getIndex() < b.getIndex(); }); - std::vector> scalar_exprs; std::vector fields; std::unordered_map old_index_to_new_index; - for (auto& input : sorted_inputs) { - CHECK_EQ(input.getSourceNode(), prev_node.get()); - CHECK(old_index_to_new_index - .insert(std::make_pair(input.getIndex(), scalar_exprs.size())) - .second); - scalar_exprs.emplace_back(input.deepCopy()); + + if (inputs.empty()) { + // this case only happens when the input is multi-fragmented but has no expr(s) + // to push down in the window_func_project_node's target exprs such as + // SELECT SUM(1), ROW_NUMBER() over () FROM test where test is multi-fragmented + // to handle this, we push an artificial literal down to the child project node + // to make an input of window_func_project_node a single-fragmented table + CHECK(has_multi_fragment_scan_input); + CHECK(!needs_expr_pushdown); + auto const bool_scale = std::numeric_limits::min(); + scalar_exprs.push_back(std::make_unique( + true, SQLTypes::kBOOLEAN, SQLTypes::kBOOLEAN, bool_scale, 1, bool_scale, 1)); + old_index_to_new_index.insert(std::make_pair(0, 0)); fields.emplace_back(""); + } else { + // we have at least one rex_input to pushdown + // let's make sure we have the correct # of exprs to pushdown + std::vector sorted_inputs(inputs.begin(), inputs.end()); + std::sort( + sorted_inputs.begin(), sorted_inputs.end(), [](const auto& a, const auto& b) { + return a.getIndex() < b.getIndex(); + }); + + for (auto& input : sorted_inputs) { + CHECK_EQ(input.getSourceNode(), prev_node.get()); + 
CHECK(old_index_to_new_index + .insert(std::make_pair(input.getIndex(), scalar_exprs.size())) + .second); + scalar_exprs.emplace_back(input.deepCopy()); + fields.emplace_back(""); + } } - + // modify window_func_project_node's target exprs to refer to push-downed expr in + // the new projection node + CHECK_GT(scalar_exprs.size(), 0UL); + CHECK_EQ(scalar_exprs.size(), fields.size()); auto new_project = std::make_shared(scalar_exprs, fields, prev_node); propagate_hints_to_new_project(window_func_project_node, new_project, query_hints); new_project->setPushedDownWindowExpr(); diff --git a/QueryEngine/RelAlgDag.h b/QueryEngine/RelAlgDag.h index 4e7a254d97..f184e4aafb 100644 --- a/QueryEngine/RelAlgDag.h +++ b/QueryEngine/RelAlgDag.h @@ -600,7 +600,7 @@ class RexWindowFunctionOperator : public RexFunctionOperator { // default constructor used for deserialization only RexWindowFunctionOperator() - : RexFunctionOperator(), kind_{SqlWindowFunctionKind::INVALID}, is_rows_{false} {} + : RexFunctionOperator(), kind_{SqlWindowFunctionKind::UNKNOWN}, is_rows_{false} {} RexWindowFunctionOperator(const SqlWindowFunctionKind kind, ConstRexScalarPtrVector& operands, @@ -3243,6 +3243,35 @@ class RelAlgDag : public boost::noncopyable { } break; } + case QueryHint::kTableReorderingOff: { + query_hint.registerHint(QueryHint::kTableReorderingOff); + query_hint.table_reordering_off = true; + if (target.isGlobalHint()) { + global_query_hint.registerHint(QueryHint::kTableReorderingOff); + global_query_hint.table_reordering_off = true; + } + break; + } + case QueryHint::kNDVGroupsEstimatorMultiplier: { + CHECK_EQ(1u, target.getListOptions().size()); + double ndv_groups_estimator_multiplier = std::stod(target.getListOptions()[0]); + if (ndv_groups_estimator_multiplier < 1.0 || + ndv_groups_estimator_multiplier > 2.0) { + VLOG(1) << "Skip the given query hint \"ndv_groups_estimator_multiplier\" (" + << target.getListOptions()[0] + << ") : the valid hint value range is 1.0 <= " + 
"ndv_groups_estimator_multiplier <= 2.0"; + } else { + query_hint.registerHint(QueryHint::kNDVGroupsEstimatorMultiplier); + query_hint.ndv_groups_estimator_multiplier = ndv_groups_estimator_multiplier; + if (target.isGlobalHint()) { + global_query_hint.registerHint(QueryHint::kNDVGroupsEstimatorMultiplier); + global_query_hint.ndv_groups_estimator_multiplier = + ndv_groups_estimator_multiplier; + } + } + break; + } default: break; } diff --git a/QueryEngine/RelAlgDagSerializer/serialization/QueryHintSerializer.h b/QueryEngine/RelAlgDagSerializer/serialization/QueryHintSerializer.h index a055facb11..d69dd1d994 100644 --- a/QueryEngine/RelAlgDagSerializer/serialization/QueryHintSerializer.h +++ b/QueryEngine/RelAlgDagSerializer/serialization/QueryHintSerializer.h @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include "QueryEngine/QueryHint.h" @@ -29,22 +28,29 @@ void serialize(Archive& ar, RegisteredQueryHint& query_hint, const unsigned int (ar & query_hint.cpu_mode); (ar & query_hint.columnar_output); (ar & query_hint.rowwise_output); - (ar & query_hint.keep_result); - (ar & query_hint.keep_table_function_result); - (ar & query_hint.watchdog); - (ar & query_hint.dynamic_watchdog); - (ar & query_hint.query_time_limit); - (ar & query_hint.cuda_block_size); - (ar & query_hint.cuda_grid_size_multiplier); - (ar & query_hint.aggregate_tree_fanout); (ar & query_hint.bbox_intersect_bucket_threshold); (ar & query_hint.bbox_intersect_max_size); (ar & query_hint.bbox_intersect_allow_gpu_build); (ar & query_hint.bbox_intersect_no_cache); (ar & query_hint.bbox_intersect_keys_per_bin); + (ar & query_hint.keep_result); + (ar & query_hint.keep_table_function_result); + (ar & query_hint.aggregate_tree_fanout); + (ar & query_hint.cuda_block_size); + (ar & query_hint.cuda_grid_size_multiplier); + (ar & query_hint.opt_cuda_grid_and_block_size); + (ar & query_hint.watchdog); + (ar & 
query_hint.dynamic_watchdog); + (ar & query_hint.query_time_limit); (ar & query_hint.use_loop_join); - (ar & query_hint.max_join_hash_table_size); (ar & query_hint.loop_join_inner_table_max_num_rows); + (ar & query_hint.max_join_hash_table_size); + (ar & query_hint.force_baseline_hash_join); + (ar & query_hint.force_one_to_many_hash_join); + (ar & query_hint.watchdog_max_projected_rows_per_device); + (ar & query_hint.preflight_count_query_threshold); + (ar & query_hint.table_reordering_off); + (ar & query_hint.ndv_groups_estimator_multiplier); (ar & query_hint.registered_hint); } diff --git a/QueryEngine/RelAlgExecutionUnit.h b/QueryEngine/RelAlgExecutionUnit.h index 72d55bf82b..1764116881 100644 --- a/QueryEngine/RelAlgExecutionUnit.h +++ b/QueryEngine/RelAlgExecutionUnit.h @@ -35,6 +35,7 @@ #include +#include #include #include #include @@ -187,6 +188,20 @@ struct RelAlgExecutionUnit { RelAlgExecutionUnit createNdvExecutionUnit(const int64_t range) const; RelAlgExecutionUnit createCountAllExecutionUnit( Analyzer::Expr* replacement_target) const; + + // Call lambda() for each aggregate target_expr of SQLAgg type AggType. 
+ template + void eachAggTarget( + std::function lambda) const { + for (size_t target_idx = 0; target_idx < target_exprs.size(); ++target_idx) { + Analyzer::Expr const* target_expr = target_exprs[target_idx]; + if (auto const* agg_expr = dynamic_cast(target_expr)) { + if (agg_expr->get_aggtype() == AggType) { + lambda(agg_expr, target_idx); + } + } + } + } }; std::ostream& operator<<(std::ostream& os, const RelAlgExecutionUnit& ra_exe_unit); diff --git a/QueryEngine/RelAlgExecutor.cpp b/QueryEngine/RelAlgExecutor.cpp index 41b9f48f18..52d3499860 100644 --- a/QueryEngine/RelAlgExecutor.cpp +++ b/QueryEngine/RelAlgExecutor.cpp @@ -55,6 +55,7 @@ bool g_skip_intermediate_count{true}; bool g_enable_interop{false}; bool g_enable_union{true}; // DEPRECATED size_t g_estimator_failure_max_groupby_size{256000000}; +double g_ndv_groups_estimator_multiplier{2.0}; bool g_columnar_large_projections{true}; size_t g_columnar_large_projections_threshold{1000000}; @@ -678,7 +679,7 @@ ExecutionResult RelAlgExecutor::executeRelAlgQueryNoRetry(const CompilationOptio try { executor_->checkPendingQueryStatus(query_session); } catch (QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { throw std::runtime_error("Query execution has been interrupted (pending query)"); } throw e; @@ -880,7 +881,7 @@ void RelAlgExecutor::prepareLeafExecution( } queue_time_ms_ = timer_stop(clock_begin); executor_->row_set_mem_owner_ = std::make_shared( - Executor::getArenaBlockSize(), executor_->executor_id_, cpu_threads()); + Executor::getArenaBlockSize(), executor_->executor_id_); executor_->row_set_mem_owner_->setDictionaryGenerations(string_dictionary_generations); executor_->table_generations_ = table_generations; executor_->agg_col_range_cache_ = agg_col_range; @@ -1128,6 +1129,11 @@ void handle_query_hint(RegisteredQueryHint const& query_hints, eo.optimize_cuda_block_and_grid_sizes = true; } } + if 
(query_hints.isHintRegistered(QueryHint::kTableReorderingOff)) { + // disable table reordering if `table_reordering_off` is enabled + VLOG(1) << "A user disables table reordering listed in the FROM clause"; + eo.table_reordering = false; + } if (query_hints.isHintRegistered(QueryHint::kColumnarOutput)) { VLOG(1) << "A user forces the query to run with columnar output"; columnar_output_hint_enabled = true; @@ -2387,9 +2393,10 @@ ExecutionResult RelAlgExecutor::executeTableFunction(const RelTableFunction* tab executor_->gridSize()), {}}; - auto global_hint = getGlobalQueryHint(); - auto use_resultset_recycler = canUseResultsetCache(eo, nullptr); - if (use_resultset_recycler && has_valid_query_plan_dag(table_func)) { + auto const global_hint = getGlobalQueryHint(); + auto const use_resultset_recycler = + canUseResultsetCache(eo, nullptr) && has_valid_query_plan_dag(table_func); + if (use_resultset_recycler) { auto cached_resultset = executor_->getResultSetRecyclerHolder().getCachedQueryResultSet( table_func->getQueryPlanDagHash()); @@ -2409,33 +2416,35 @@ ExecutionResult RelAlgExecutor::executeTableFunction(const RelTableFunction* tab body->getOutputMetainfo()}; } catch (const QueryExecutionError& e) { handlePersistentError(e.getErrorCode()); - CHECK(e.getErrorCode() == Executor::ERR_OUT_OF_GPU_MEM); + CHECK(e.hasErrorCode(ErrorCode::OUT_OF_GPU_MEM)) << e.getErrorCode(); throw std::runtime_error("Table function ran out of memory during execution"); } auto query_exec_time = timer_stop(query_exec_time_begin); result.setQueueTime(queue_time_ms); auto resultset_ptr = result.getDataPtr(); - auto allow_auto_caching_resultset = resultset_ptr && resultset_ptr->hasValidBuffer() && - g_allow_auto_resultset_caching && - resultset_ptr->getBufferSizeBytes(co.device_type) <= - g_auto_resultset_caching_threshold; - bool keep_result = global_hint->isHintRegistered(QueryHint::kKeepTableFuncResult); - if (use_resultset_recycler && (keep_result || allow_auto_caching_resultset) && - 
!hasStepForUnion()) { + if (use_resultset_recycler) { resultset_ptr->setExecTime(query_exec_time); resultset_ptr->setQueryPlanHash(table_func_work_unit.exe_unit.query_plan_dag_hash); resultset_ptr->setTargetMetaInfo(body->getOutputMetainfo()); auto input_table_keys = ScanNodeTableKeyCollector::getScanNodeTableKey(body); resultset_ptr->setInputTableKeys(std::move(input_table_keys)); - if (allow_auto_caching_resultset) { - VLOG(1) << "Automatically keep table function's query resultset to recycler"; - } - executor_->getResultSetRecyclerHolder().putQueryResultSetToCache( - table_func_work_unit.exe_unit.query_plan_dag_hash, - resultset_ptr->getInputTableKeys(), - resultset_ptr, - resultset_ptr->getBufferSizeBytes(co.device_type), - target_exprs_owned_); + auto allow_auto_caching_resultset = + resultset_ptr && resultset_ptr->hasValidBuffer() && + g_allow_auto_resultset_caching && + resultset_ptr->getBufferSizeBytes(co.device_type) <= + g_auto_resultset_caching_threshold; + if (global_hint->isHintRegistered(QueryHint::kKeepTableFuncResult) || + allow_auto_caching_resultset) { + if (allow_auto_caching_resultset) { + VLOG(1) << "Automatically keep table function's query resultset to recycler"; + } + executor_->getResultSetRecyclerHolder().putQueryResultSetToCache( + table_func_work_unit.exe_unit.query_plan_dag_hash, + resultset_ptr->getInputTableKeys(), + resultset_ptr, + resultset_ptr->getBufferSizeBytes(co.device_type), + target_exprs_owned_); + } } else { if (eo.keep_result) { if (g_cluster) { @@ -2778,7 +2787,6 @@ ExecutionResult RelAlgExecutor::executeLogicalValues( false, /*is_varlen_projection=*/false}); } - std::shared_ptr rs{ ResultSetLogicalValuesBuilder{logical_values, target_infos, @@ -3295,8 +3303,7 @@ ExecutionResult RelAlgExecutor::executeSort(const RelSort* sort, SortInfo sort_info{order_entries, sort_algorithm, limit, offset}; auto source_query_plan_dag = QueryPlanDagExtractor::applyLimitClauseToCacheKey( source_node->getQueryPlanDagHash(), sort_info); - 
bool enable_resultset_recycler = canUseResultsetCache(eo, render_info); - if (enable_resultset_recycler && has_valid_query_plan_dag(source_node) && + if (canUseResultsetCache(eo, render_info) && has_valid_query_plan_dag(source_node) && !sort->isEmptyResult()) { if (auto cached_resultset = executor_->getResultSetRecyclerHolder().getCachedQueryResultSet( @@ -3870,12 +3877,13 @@ ExecutionResult RelAlgExecutor::executeWorkUnit( handlePersistentError(e.getErrorCode()); return handleOutOfMemoryRetry( {ra_exe_unit, work_unit.body, local_groups_buffer_entry_guess}, + column_cache, targets_meta, is_agg, co, eo, render_info, - e.wasMultifragKernelLaunch(), + e, queue_time_ms); } }; @@ -3923,9 +3931,17 @@ ExecutionResult RelAlgExecutor::executeWorkUnit( } else { const auto ndv_groups_estimation = getNDVEstimation(work_unit, e.range(), is_agg, co, eo); + auto ndv_groups_estimator_multiplier = g_ndv_groups_estimator_multiplier; + if (query_hints.isHintRegistered(QueryHint::kNDVGroupsEstimatorMultiplier)) { + ndv_groups_estimator_multiplier = query_hints.ndv_groups_estimator_multiplier; + VLOG(1) << "Modify NDV groups estimator multiplier: " + << g_ndv_groups_estimator_multiplier << " -> " + << ndv_groups_estimator_multiplier; + } const auto estimated_groups_buffer_entry_guess = ndv_groups_estimation > 0 - ? 2 * ndv_groups_estimation + ? 
static_cast(static_cast(ndv_groups_estimation) * + ndv_groups_estimator_multiplier) : std::min(groups_approx_upper_bound(table_infos).first, g_estimator_failure_max_groupby_size); CHECK_GT(estimated_groups_buffer_entry_guess, size_t(0)); @@ -3969,27 +3985,29 @@ ExecutionResult RelAlgExecutor::executeWorkUnit( } const auto res = result.getDataPtr(); - auto allow_auto_caching_resultset = - res && res->hasValidBuffer() && g_allow_auto_resultset_caching && - res->getBufferSizeBytes(co.device_type) <= g_auto_resultset_caching_threshold; - if (use_resultset_cache && (eo.keep_result || allow_auto_caching_resultset)) { + if (use_resultset_cache && has_valid_query_plan_dag(body)) { auto query_exec_time = timer_stop(query_exec_time_begin); res->setExecTime(query_exec_time); res->setQueryPlanHash(ra_exe_unit.query_plan_dag_hash); res->setTargetMetaInfo(body->getOutputMetainfo()); auto input_table_keys = ScanNodeTableKeyCollector::getScanNodeTableKey(body); res->setInputTableKeys(std::move(input_table_keys)); - if (allow_auto_caching_resultset) { - VLOG(1) << "Automatically keep query resultset to recycler"; - } - res->setUseSpeculativeTopNSort( - use_speculative_top_n(ra_exe_unit, res->getQueryMemDesc())); - executor_->getResultSetRecyclerHolder().putQueryResultSetToCache( - ra_exe_unit.query_plan_dag_hash, - res->getInputTableKeys(), - res, - res->getBufferSizeBytes(co.device_type), - target_exprs_owned_); + auto allow_auto_caching_resultset = + res && res->hasValidBuffer() && g_allow_auto_resultset_caching && + res->getBufferSizeBytes(co.device_type) <= g_auto_resultset_caching_threshold; + if (eo.keep_result || allow_auto_caching_resultset) { + if (allow_auto_caching_resultset) { + VLOG(1) << "Automatically keep query resultset to recycler"; + } + res->setUseSpeculativeTopNSort( + use_speculative_top_n(ra_exe_unit, res->getQueryMemDesc())); + executor_->getResultSetRecyclerHolder().putQueryResultSetToCache( + ra_exe_unit.query_plan_dag_hash, + res->getInputTableKeys(), + 
res, + res->getBufferSizeBytes(co.device_type), + target_exprs_owned_); + } } else { if (eo.keep_result) { if (g_cluster) { @@ -4146,12 +4164,13 @@ bool RelAlgExecutor::isRowidLookup(const WorkUnit& work_unit) { ExecutionResult RelAlgExecutor::handleOutOfMemoryRetry( const RelAlgExecutor::WorkUnit& work_unit, + ColumnCacheMap& column_cache, const std::vector& targets_meta, const bool is_agg, const CompilationOptions& co, const ExecutionOptions& eo, RenderInfo* render_info, - const bool was_multifrag_kernel_launch, + const QueryExecutionError& e, const int64_t queue_time_ms) { // Disable the bump allocator // Note that this will have basically the same affect as using the bump allocator for @@ -4174,7 +4193,7 @@ ExecutionResult RelAlgExecutor::handleOutOfMemoryRetry( eo_no_multifrag.setNoExplainExecutionOptions(true); eo_no_multifrag.allow_multifrag = false; eo_no_multifrag.find_push_down_candidates = false; - if (was_multifrag_kernel_launch) { + if (e.wasMultifragKernelLaunch()) { try { // Attempt to retry using the kernel per fragment path. The smaller input size // required may allow the entire kernel to execute in GPU memory. 
@@ -4182,7 +4201,6 @@ ExecutionResult RelAlgExecutor::handleOutOfMemoryRetry( "kernels disabled."; const auto ra_exe_unit = decide_approx_count_distinct_implementation( ra_exe_unit_in, table_infos, executor_, co.device_type, target_exprs_owned_); - ColumnCacheMap column_cache; result = {executor_->executeWorkUnit(max_groups_buffer_entry_guess, is_agg, table_infos, @@ -4194,8 +4212,8 @@ ExecutionResult RelAlgExecutor::handleOutOfMemoryRetry( column_cache), targets_meta}; result.setQueueTime(queue_time_ms); - } catch (const QueryExecutionError& e) { - handlePersistentError(e.getErrorCode()); + } catch (const QueryExecutionError& new_e) { + handlePersistentError(new_e.getErrorCode()); LOG(WARNING) << "Kernel per fragment query ran out of memory, retrying on CPU."; } } @@ -4205,20 +4223,20 @@ ExecutionResult RelAlgExecutor::handleOutOfMemoryRetry( } const auto co_cpu = CompilationOptions::makeCpuOnly(co); - // Only reset the group buffer entry guess if we ran out of slots, which - // suggests a - // highly pathological input which prevented a good estimation of distinct tuple - // count. For projection queries, this will force a per-fragment scan limit, which is - // compatible with the CPU path - VLOG(1) << "Resetting max groups buffer entry guess."; - max_groups_buffer_entry_guess = 0; + if (e.getErrorCode() < 0) { + // Only reset the group buffer entry guess if we ran out of slots, which + // suggests a highly pathological input which prevented a good estimation of distinct + // tuple count. 
For projection queries, this will force a per-fragment scan limit, + // which is compatible with the CPU path + VLOG(1) << "Resetting max groups buffer entry guess."; + max_groups_buffer_entry_guess = 0; + } int iteration_ctr = -1; while (true) { iteration_ctr++; auto ra_exe_unit = decide_approx_count_distinct_implementation( ra_exe_unit_in, table_infos, executor_, co_cpu.device_type, target_exprs_owned_); - ColumnCacheMap column_cache; try { result = {executor_->executeWorkUnit(max_groups_buffer_entry_guess, is_agg, @@ -4230,9 +4248,9 @@ ExecutionResult RelAlgExecutor::handleOutOfMemoryRetry( true, column_cache), targets_meta}; - } catch (const QueryExecutionError& e) { + } catch (const QueryExecutionError& new_e) { // Ran out of slots - if (e.getErrorCode() < 0) { + if (new_e.getErrorCode() < 0) { // Even the conservative guess failed; it should only happen when we group // by a huge cardinality array. Maybe we should throw an exception instead? // Such a heavy query is entirely capable of exhausting all the host memory. @@ -4248,7 +4266,7 @@ ExecutionResult RelAlgExecutor::handleOutOfMemoryRetry( "guess equal to " << max_groups_buffer_entry_guess; } else { - handlePersistentError(e.getErrorCode()); + handlePersistentError(new_e.getErrorCode()); } continue; } @@ -4261,7 +4279,7 @@ ExecutionResult RelAlgExecutor::handleOutOfMemoryRetry( void RelAlgExecutor::handlePersistentError(const int32_t error_code) { LOG(ERROR) << "Query execution failed with error " << getErrorMessageFromCode(error_code); - if (error_code == Executor::ERR_OUT_OF_GPU_MEM) { + if (error_code == int32_t(ErrorCode::OUT_OF_GPU_MEM)) { // We ran out of GPU memory, this doesn't count as an error if the query is // allowed to continue on CPU because retry on CPU is explicitly allowed through // --allow-cpu-retry. 
@@ -4283,54 +4301,12 @@ struct ErrorInfo { ErrorInfo getErrorDescription(const int32_t error_code) { // 'designated initializers' don't compile on Windows for std 17 // They require /std:c++20. They been removed for the windows port. - switch (error_code) { - case Executor::ERR_DIV_BY_ZERO: - return {"ERR_DIV_BY_ZERO", "Division by zero"}; - case Executor::ERR_OUT_OF_GPU_MEM: - return {"ERR_OUT_OF_GPU_MEM", - - "Query couldn't keep the entire working set of columns in GPU memory"}; - case Executor::ERR_UNSUPPORTED_SELF_JOIN: - return {"ERR_UNSUPPORTED_SELF_JOIN", "Self joins not supported yet"}; - case Executor::ERR_OUT_OF_CPU_MEM: - return {"ERR_OUT_OF_CPU_MEM", "Not enough host memory to execute the query"}; - case Executor::ERR_OVERFLOW_OR_UNDERFLOW: - return {"ERR_OVERFLOW_OR_UNDERFLOW", "Overflow or underflow"}; - case Executor::ERR_OUT_OF_TIME: - return {"ERR_OUT_OF_TIME", "Query execution has exceeded the time limit"}; - case Executor::ERR_INTERRUPTED: - return {"ERR_INTERRUPTED", "Query execution has been interrupted"}; - case Executor::ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED: - return {"ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED", - "Columnar conversion not supported for variable length types"}; - case Executor::ERR_TOO_MANY_LITERALS: - return {"ERR_TOO_MANY_LITERALS", "Too many literals in the query"}; - case Executor::ERR_STRING_CONST_IN_RESULTSET: - return {"ERR_STRING_CONST_IN_RESULTSET", - - "NONE ENCODED String types are not supported as input result set."}; - case Executor::ERR_OUT_OF_RENDER_MEM: - return {"ERR_OUT_OF_RENDER_MEM", - - "Insufficient GPU memory for query results in render output buffer " - "sized by render-mem-bytes"}; - case Executor::ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY: - return {"ERR_STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY", - "Streaming-Top-N not supported in Render Query"}; - case Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES: - return {"ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES", - "Multiple distinct values 
encountered"}; - case Executor::ERR_GEOS: - return {"ERR_GEOS", "ERR_GEOS"}; - case Executor::ERR_WIDTH_BUCKET_INVALID_ARGUMENT: - return {"ERR_WIDTH_BUCKET_INVALID_ARGUMENT", - - "Arguments of WIDTH_BUCKET function does not satisfy the condition"}; - default: - return {nullptr, nullptr}; + if (0 < error_code && error_code < int32_t(ErrorCode::N_)) { + auto const ec = static_cast(error_code); + return {to_string(ec), to_description(ec)}; } + return {nullptr, nullptr}; } - } // namespace std::string RelAlgExecutor::getErrorMessageFromCode(const int32_t error_code) { @@ -4560,7 +4536,7 @@ RelAlgExecutor::WorkUnit RelAlgExecutor::createCompoundWorkUnit( left_deep_join_input_sizes = get_left_deep_join_input_sizes(left_deep_join); left_deep_join_quals = translateLeftDeepJoinFilter( left_deep_join, input_descs, input_to_nest_level, eo.just_explain); - if (g_from_table_reordering && + if (eo.table_reordering && std::find(join_types.begin(), join_types.end(), JoinType::LEFT) == join_types.end()) { input_permutation = do_table_reordering(input_descs, @@ -4960,7 +4936,7 @@ RelAlgExecutor::WorkUnit RelAlgExecutor::createProjectWorkUnit( left_deep_join_input_sizes = get_left_deep_join_input_sizes(left_deep_join); left_deep_join_quals = translateLeftDeepJoinFilter( left_deep_join, input_descs, input_to_nest_level, eo.just_explain); - if (g_from_table_reordering) { + if (eo.table_reordering) { input_permutation = do_table_reordering(input_descs, input_col_descs, left_deep_join_quals, diff --git a/QueryEngine/RelAlgExecutor.h b/QueryEngine/RelAlgExecutor.h index 8c3dbaf5b3..2ac83fd7f7 100644 --- a/QueryEngine/RelAlgExecutor.h +++ b/QueryEngine/RelAlgExecutor.h @@ -19,6 +19,7 @@ #include "Distributed/AggregatedResult.h" #include "QueryEngine/Descriptors/RelAlgExecutionDescriptor.h" +#include "QueryEngine/ErrorHandling.h" #include "QueryEngine/Execute.h" #include "QueryEngine/InputMetadata.h" #include "QueryEngine/JoinFilterPushDown.h" @@ -357,12 +358,13 @@ class RelAlgExecutor : 
private StorageIOFacility { bool isRowidLookup(const WorkUnit& work_unit); ExecutionResult handleOutOfMemoryRetry(const RelAlgExecutor::WorkUnit& work_unit, + ColumnCacheMap& column_cache, const std::vector& targets_meta, const bool is_agg, const CompilationOptions& co, const ExecutionOptions& eo, RenderInfo* render_info, - const bool was_multifrag_kernel_launch, + const QueryExecutionError& e, const int64_t queue_time_ms); // Allows an out of memory error through if CPU retry is enabled. Otherwise, throws an diff --git a/QueryEngine/RelAlgTranslator.cpp b/QueryEngine/RelAlgTranslator.cpp index e71944fc0f..036a20a2aa 100644 --- a/QueryEngine/RelAlgTranslator.cpp +++ b/QueryEngine/RelAlgTranslator.cpp @@ -36,7 +36,10 @@ #include extern bool g_enable_watchdog; - +extern size_t g_watchdog_in_clause_max_num_elem_bitmap; +extern size_t g_watchdog_in_clause_max_num_elem_non_bitmap; +extern size_t g_watchdog_in_clause_max_num_input_rows; +extern size_t g_in_clause_num_elem_skip_bitmap; bool g_enable_string_functions{true}; namespace { @@ -556,7 +559,7 @@ std::shared_ptr RelAlgTranslator::translateInput( std::shared_ptr RelAlgTranslator::translateUoper( const RexOperator* rex_operator) const { CHECK_EQ(size_t(1), rex_operator->size()); - const auto operand_expr = translateScalarRex(rex_operator->getOperand(0)); + auto operand_expr = translateScalarRex(rex_operator->getOperand(0)); const auto sql_op = rex_operator->getOperator(); switch (sql_op) { case kCAST: { @@ -580,7 +583,17 @@ std::shared_ptr RelAlgTranslator::translateUoper( return std::make_shared(target_ti, false, sql_op, operand_expr); } case kENCODE_TEXT: { - const auto& target_ti = rex_operator->getType(); + SQLTypeInfo target_ti = rex_operator->getType(); + if (target_ti.get_type() == kNULLT) { + if (auto const_expr = + dynamic_cast(operand_expr.get())) { + if (const_expr->get_type_info() == kNULLT && const_expr->get_is_null()) { + // make a typed NULL constant and sync it to target_ti + operand_expr = 
makeExpr(kTEXT, true); + target_ti.set_type(kTEXT); + } + } + } CHECK_NE(kNULLT, target_ti.get_type()); const auto& operand_ti = operand_expr->get_type_info(); CHECK(operand_ti.is_string()); @@ -634,9 +647,12 @@ std::shared_ptr get_in_values_expr(std::shared_ptr 5000000 && g_enable_watchdog) { - throw std::runtime_error( - "Unable to handle 'expr IN (subquery)', subquery returned 5M+ rows."); + if (val_set.rowCount() > g_watchdog_in_clause_max_num_input_rows && g_enable_watchdog) { + std::ostringstream oss; + oss << "Unable to handle 'expr IN (subquery)': # input rows (" << val_set.rowCount() + << ") is larger than threshold 'g_watchdog_in_clause_max_num_input_rows':" + << g_watchdog_in_clause_max_num_input_rows; + throw std::runtime_error(oss.str()); } std::list> value_exprs; const size_t fetcher_count = cpu_threads(); @@ -721,18 +737,25 @@ std::shared_ptr RelAlgTranslator::translateInOper( "The two sides of the IN operator must have the same type; found " + ti.get_type_name() + " and " + rhs_ti.get_type_name()); } + ScopeGuard elapsed_time_log = [clock_begin = timer_start()] { + VLOG(1) << "RelAlgTranslator::translateInOper: took " << timer_stop(clock_begin) + << " ms"; + }; row_set->moveToBegin(); - if (row_set->entryCount() > 10000) { - std::shared_ptr expr; - if ((ti.is_integer() || (ti.is_string() && ti.get_compression() == kENCODING_DICT)) && - !row_set->getQueryMemDesc().didOutputColumnar()) { - expr = getInIntegerSetExpr(lhs, *row_set); - // Handle the highly unlikely case when the InIntegerSet ended up being tiny. - // Just let it fall through the usual InValues path at the end of this method, - // its codegen knows to use inline comparisons for few values. 
- if (expr && std::static_pointer_cast(expr) - ->get_value_list() - .size() <= 100) { + std::shared_ptr expr; + if ((ti.is_integer() || (ti.is_string() && ti.get_compression() == kENCODING_DICT)) && + !row_set->didOutputColumnar()) { + expr = getInIntegerSetExpr(lhs, *row_set); + // Handle the highly unlikely case when the InIntegerSet ended up being tiny. + // Just let it fall through the usual InValues path at the end of this method, + // its codegen knows to use inline comparisons for few values. + if (expr) { + auto const num_values = + std::static_pointer_cast(expr)->get_value_list().size(); + if (num_values <= g_in_clause_num_elem_skip_bitmap) { + VLOG(1) << "Skip to build a bitmap for tiny integer-set case: # values (" + << ::toString(num_values) << ") <= threshold (" + << ::toString(g_in_clause_num_elem_skip_bitmap) << ")"; expr = nullptr; } } else { @@ -748,9 +771,15 @@ std::shared_ptr RelAlgTranslator::translateInOper( if (row.empty()) { break; } - if (g_enable_watchdog && value_exprs.size() >= 10000) { - throw std::runtime_error( - "Unable to handle 'expr IN (subquery)', subquery returned 10000+ rows."); + if (g_enable_watchdog && + value_exprs.size() >= g_watchdog_in_clause_max_num_elem_non_bitmap) { + std::ostringstream oss; + oss << "Unable to handle 'expr IN (subquery)' via non-bitmap, # unique values (" + << value_exprs.size() + << ") is larger than the threshold " + "'g_watchdog_in_clause_max_num_elem_non_bitmap': " + << g_watchdog_in_clause_max_num_elem_non_bitmap; + throw std::runtime_error(oss.str()); } auto scalar_tv = boost::get(&row[0]); Datum d{0}; @@ -772,8 +801,6 @@ std::shared_ptr RelAlgTranslator::translateInOper( namespace { -const size_t g_max_integer_set_size{1 << 25}; - void fill_dictionary_encoded_in_vals( std::vector& in_vals, std::atomic& total_in_vals_count, @@ -802,9 +829,14 @@ void fill_dictionary_encoded_in_vals( } } if (UNLIKELY(g_enable_watchdog && (in_vals.size() & 1023) == 0 && - total_in_vals_count.fetch_add(1024) >= 
g_max_integer_set_size)) { - throw std::runtime_error( - "Unable to handle 'expr IN (subquery)', subquery returned 30M+ rows."); + total_in_vals_count.fetch_add(1024) >= + g_watchdog_in_clause_max_num_elem_bitmap)) { + std::ostringstream oss; + oss << "Unable to handle 'expr IN (subquery)' via bitmap, # unique encoded-string (" + << total_in_vals_count.load() + << ") is larger than the threshold 'g_watchdog_in_clause_max_num_elem_bitmap': " + << g_watchdog_in_clause_max_num_elem_bitmap; + throw std::runtime_error(oss.str()); } } } @@ -820,9 +852,16 @@ void fill_integer_in_vals(std::vector& in_vals, if (row.valid) { in_vals.push_back(row.value); if (UNLIKELY(g_enable_watchdog && (in_vals.size() & 1023) == 0 && - total_in_vals_count.fetch_add(1024) >= g_max_integer_set_size)) { - throw std::runtime_error( - "Unable to handle 'expr IN (subquery)', subquery returned 30M+ rows."); + total_in_vals_count.fetch_add(1024) >= + g_watchdog_in_clause_max_num_elem_bitmap)) { + std::ostringstream oss; + oss << "Unable to handle 'expr IN (subquery)' via bitmap, # unique integer " + "values (" + << total_in_vals_count.load() + << ") is larger than the threshold " + "'g_watchdog_in_clause_max_num_elem_bitmap': " + << g_watchdog_in_clause_max_num_elem_bitmap; + throw std::runtime_error(oss.str()); } } } @@ -861,9 +900,16 @@ void fill_dictionary_encoded_in_vals( if (row.value != needle_null_val) { in_vals.push_back(row.value); if (UNLIKELY(g_enable_watchdog && (in_vals.size() & 1023) == 0 && - total_in_vals_count.fetch_add(1024) >= g_max_integer_set_size)) { - throw std::runtime_error( - "Unable to handle 'expr IN (subquery)', subquery returned 30M+ rows."); + total_in_vals_count.fetch_add(1024) >= + g_watchdog_in_clause_max_num_elem_bitmap)) { + std::ostringstream oss; + oss << "Unable to handle 'expr IN (subquery)' via bitmap, # unique " + "encoded-string values (" + << total_in_vals_count.load() + << ") is larger than the threshold " + "'g_watchdog_in_clause_max_num_elem_bitmap': 
" + << g_watchdog_in_clause_max_num_elem_bitmap; + throw std::runtime_error(oss.str()); } } else { has_nulls = true; @@ -904,9 +950,16 @@ void fill_dictionary_encoded_in_vals( if (dest_id != StringDictionary::INVALID_STR_ID) { in_vals.push_back(dest_id); if (UNLIKELY(g_enable_watchdog && (in_vals.size() & 1023) == 0 && - total_in_vals_count.fetch_add(1024) >= g_max_integer_set_size)) { - throw std::runtime_error( - "Unable to handle 'expr IN (subquery)', subquery returned 30M+ rows."); + total_in_vals_count.fetch_add(1024) >= + g_watchdog_in_clause_max_num_elem_bitmap)) { + std::ostringstream oss; + oss << "Unable to handle 'expr IN (subquery)' via bitmap, # unique " + "encoded-string values (" + << total_in_vals_count.load() + << ") is larger than the threshold " + "'g_watchdog_in_clause_max_num_elem_bitmap': " + << g_watchdog_in_clause_max_num_elem_bitmap; + throw std::runtime_error(oss.str()); } } } @@ -959,8 +1012,8 @@ std::shared_ptr RelAlgTranslator::getInIntegerSetExpr( col_type.getStringDictKey(), val_set.getRowSetMemOwner(), true); CHECK(sd); const auto needle_null_val = inline_int_null_val(arg_type); - const auto catalog = Catalog_Namespace::SysCatalog::instance().getCatalog( - col_expr->getColumnKey().db_id); + const auto catalog = + Catalog_Namespace::SysCatalog::instance().getCatalog(source_dict_key.db_id); CHECK(catalog); fetcher_threads.push_back(std::async( std::launch::async, @@ -1515,6 +1568,8 @@ std::shared_ptr RelAlgTranslator::translateStringOper( return makeExpr(args); case SqlStringOpKind::REGEXP_SUBSTR: return makeExpr(args); + case SqlStringOpKind::REGEXP_COUNT: + return makeExpr(args); case SqlStringOpKind::JSON_VALUE: return makeExpr(args); case SqlStringOpKind::BASE64_ENCODE: @@ -1522,6 +1577,11 @@ std::shared_ptr RelAlgTranslator::translateStringOper( case SqlStringOpKind::BASE64_DECODE: return makeExpr(args); case SqlStringOpKind::TRY_STRING_CAST: + if (rex_function->getType().is_string() && + args.front()->get_type_info().is_string()) 
{ + // ignore try_cast and return string as is + return args.front(); + } return makeExpr(rex_function->getType(), args); case SqlStringOpKind::POSITION: return makeExpr(args); @@ -1529,6 +1589,12 @@ std::shared_ptr RelAlgTranslator::translateStringOper( return makeExpr(args); case SqlStringOpKind::LEVENSHTEIN_DISTANCE: return makeExpr(args); + case SqlStringOpKind::HASH: + return makeExpr(args); + case SqlStringOpKind::URL_ENCODE: + return makeExpr(args); + case SqlStringOpKind::URL_DECODE: + return makeExpr(args); default: { throw std::runtime_error("Unsupported string function."); } @@ -1785,13 +1851,17 @@ std::shared_ptr RelAlgTranslator::translateFunction( "REGEXP_REPLACE"sv, "REGEXP_SUBSTR"sv, "REGEXP_MATCH"sv, + "REGEXP_COUNT"sv, "JSON_VALUE"sv, "BASE64_ENCODE"sv, "BASE64_DECODE"sv, + "URL_ENCODE"sv, + "URL_DECODE"sv, "TRY_CAST"sv, "POSITION"sv, "JAROWINKLER_SIMILARITY"sv, - "LEVENSHTEIN_DISTANCE"sv)) { + "LEVENSHTEIN_DISTANCE"sv, + "HASH"sv)) { return translateStringOper(rex_function); } if (func_resolve(rex_function->getName(), "CARDINALITY"sv, "ARRAY_LENGTH"sv)) { @@ -2028,8 +2098,8 @@ std::shared_ptr RelAlgTranslator::translateFunction( throw; } - // By default, the extension function type will not allow nulls. If one of the arguments - // is nullable, the extension function must also explicitly allow nulls. + // By default, the extension function type will not allow nulls. If one of the + // arguments is nullable, the extension function must also explicitly allow nulls. 
bool arguments_not_null = true; for (const auto& arg_expr : arg_expr_list) { if (!arg_expr->get_type_info().get_notnull()) { @@ -2096,8 +2166,93 @@ ExtractField determineTimeUnit(const SQLTypes& window_frame_bound_type, CHECK(false); return kUNKNOWN_FIELD; } + +SqlWindowFrameBoundType determine_frame_bound_type( + const RexWindowFunctionOperator::RexWindowBound& bound) { + if (bound.unbounded) { + CHECK(!bound.bound_expr && !bound.is_current_row); + if (bound.following) { + return SqlWindowFrameBoundType::UNBOUNDED_FOLLOWING; + } else if (bound.preceding) { + return SqlWindowFrameBoundType::UNBOUNDED_PRECEDING; + } + } else { + if (bound.is_current_row) { + CHECK(!bound.unbounded && !bound.bound_expr); + return SqlWindowFrameBoundType::CURRENT_ROW; + } else { + CHECK(!bound.unbounded && bound.bound_expr); + if (bound.following) { + return SqlWindowFrameBoundType::EXPR_FOLLOWING; + } else if (bound.preceding) { + return SqlWindowFrameBoundType::EXPR_PRECEDING; + } + } + } + return SqlWindowFrameBoundType::UNKNOWN; +} + +bool is_negative_framing_bound(const SQLTypes t, + const Datum& d, + bool is_time_unit = false) { + switch (t) { + case kTINYINT: + return d.tinyintval < 0; + case kSMALLINT: + return d.smallintval < 0; + case kINT: + return d.intval < 0; + case kDOUBLE: { + // the only case that double type is used is for handling time interval + // i.e., represent tiny time units like nanosecond and microsecond as the + // equivalent time value with SECOND time unit + CHECK(is_time_unit); + return d.doubleval < 0; + } + case kDECIMAL: + case kNUMERIC: + case kBIGINT: + return d.bigintval < 0; + default: { + throw std::runtime_error( + "We currently only support integer-type literal expression as a window " + "frame bound expression"); + } + } +} + } // namespace +// this function returns three elements as a tuple as follows: +// 1) `bound_expr` is invalid +// 2) `bound_expr` has a negative constant +// 3) a translated bound expr which has `Analyzer::Expr*` type 
+std::tuple> +RelAlgTranslator::translateFrameBoundExpr(const RexScalar* bound_expr) const { + bool negative_constant = false; + if (dynamic_cast(bound_expr)) { + auto translated_expr = translateScalarRex(bound_expr); + const auto bin_oper = dynamic_cast(translated_expr.get()); + auto time_literal_expr = + dynamic_cast(bin_oper->get_left_operand()); + CHECK(time_literal_expr); + negative_constant = + is_negative_framing_bound(time_literal_expr->get_type_info().get_type(), + time_literal_expr->get_constval(), + true); + return std::make_tuple(false, negative_constant, translated_expr); + } else if (dynamic_cast(bound_expr)) { + auto translated_expr = translateScalarRex(bound_expr); + if (auto literal_expr = + dynamic_cast(translated_expr.get())) { + negative_constant = is_negative_framing_bound( + literal_expr->get_type_info().get_type(), literal_expr->get_constval()); + return std::make_tuple(false, negative_constant, translated_expr); + } + } + return std::make_tuple(true, negative_constant, nullptr); +} + std::shared_ptr RelAlgTranslator::translateWindowFunction( const RexWindowFunctionOperator* rex_window_function) const { std::vector> args; @@ -2119,59 +2274,16 @@ std::shared_ptr RelAlgTranslator::translateWindowFunction( auto window_func_kind = rex_window_function->getKind(); if (window_function_is_value(window_func_kind)) { CHECK_GE(args.size(), 1u); - ti = args.front()->get_type_info(); - } - auto determine_frame_bound_type = - [](const RexWindowFunctionOperator::RexWindowBound& bound) { - if (bound.unbounded) { - CHECK(!bound.bound_expr && !bound.is_current_row); - if (bound.following) { - return SqlWindowFrameBoundType::UNBOUNDED_FOLLOWING; - } else if (bound.preceding) { - return SqlWindowFrameBoundType::UNBOUNDED_PRECEDING; - } - } else { - if (bound.is_current_row) { - CHECK(!bound.unbounded && !bound.bound_expr); - return SqlWindowFrameBoundType::CURRENT_ROW; - } else { - CHECK(!bound.unbounded && bound.bound_expr); - if (bound.following) { - return 
SqlWindowFrameBoundType::EXPR_FOLLOWING; - } else if (bound.preceding) { - return SqlWindowFrameBoundType::EXPR_PRECEDING; - } - } - } - return SqlWindowFrameBoundType::UNKNOWN; - }; - auto is_negative_framing_bound = - [](const SQLTypes t, const Datum& d, bool is_time_unit = false) { - switch (t) { - case kTINYINT: - return d.tinyintval < 0; - case kSMALLINT: - return d.smallintval < 0; - case kINT: - return d.intval < 0; - case kDOUBLE: { - // the only case that double type is used is for handling time interval - // i.e., represent tiny time units like nanosecond and microsecond as the - // equivalent time value with SECOND time unit - CHECK(is_time_unit); - return d.doubleval < 0; - } - case kDECIMAL: - case kNUMERIC: - case kBIGINT: - return d.bigintval < 0; - default: { - throw std::runtime_error( - "We currently only support integer-type literal expression as a window " - "frame bound expression"); - } - } - }; + if (!window_function_is_value_with_frame(window_func_kind)) { + // value window functions w/ frame have logic to access argument's typeinfo + // during codegen, i.e., codegenWindowNavigationFunctionOnFrame(...) 
+ // but not for non-framed value window function, so we use their arg's typeinfo + // as window function's typeinfo + ti = args.front()->get_type_info(); + } + // set value type window functions' nullability + ti.set_notnull(false); + } bool negative_constant = false; bool detect_invalid_frame_start_bound_expr = false; @@ -2193,49 +2305,24 @@ std::shared_ptr RelAlgTranslator::translateWindowFunction( if (order_keys.empty()) { if (frame_start_bound_type == SqlWindowFrameBoundType::UNBOUNDED_PRECEDING && frame_end_bound_type == SqlWindowFrameBoundType::UNBOUNDED_FOLLOWING) { - // Calcite sets UNBOUNDED PRECEDING ~ UNBOUNDED_FOLLOWING as its default frame bound - // if the window context has no order by clause regardless of the existence of - // user-given window frame bound but at this point we have no way to recognize the - // absence of the frame definition of this window context + // Calcite sets UNBOUNDED PRECEDING ~ UNBOUNDED_FOLLOWING as its default frame + // bound if the window context has no order by clause regardless of the existence + // of user-given window frame bound but at this point we have no way to recognize + // the absence of the frame definition of this window context has_framing_clause = false; } } else { - auto translate_frame_bound_expr = [&](const RexScalar* bound_expr) { - std::shared_ptr translated_expr; - const auto rex_oper = dynamic_cast(bound_expr); - if (rex_oper && rex_oper->getType().is_timeinterval()) { - translated_expr = translateScalarRex(rex_oper); - const auto bin_oper = - dynamic_cast(translated_expr.get()); - auto time_literal_expr = - dynamic_cast(bin_oper->get_left_operand()); - CHECK(time_literal_expr); - negative_constant = - is_negative_framing_bound(time_literal_expr->get_type_info().get_type(), - time_literal_expr->get_constval(), - true); - return std::make_pair(false, translated_expr); - } - if (dynamic_cast(bound_expr)) { - translated_expr = translateScalarRex(bound_expr); - if (auto literal_expr = - 
dynamic_cast(translated_expr.get())) { - negative_constant = is_negative_framing_bound( - literal_expr->get_type_info().get_type(), literal_expr->get_constval()); - return std::make_pair(false, translated_expr); - } - } - return std::make_pair(true, translated_expr); - }; - if (frame_start_bound.bound_expr) { - std::tie(detect_invalid_frame_start_bound_expr, frame_start_bound_expr) = - translate_frame_bound_expr(frame_start_bound.bound_expr.get()); + std::tie(detect_invalid_frame_start_bound_expr, + negative_constant, + frame_start_bound_expr) = + translateFrameBoundExpr(frame_start_bound.bound_expr.get()); } if (frame_end_bound.bound_expr) { - std::tie(detect_invalid_frame_end_bound_expr, frame_end_bound_expr) = - translate_frame_bound_expr(frame_end_bound.bound_expr.get()); + std::tie( + detect_invalid_frame_end_bound_expr, negative_constant, frame_end_bound_expr) = + translateFrameBoundExpr(frame_end_bound.bound_expr.get()); } // currently we only support literal expression as frame bound expression @@ -2252,27 +2339,20 @@ std::shared_ptr RelAlgTranslator::translateWindowFunction( "A constant expression for window framing should have nonnegative value."); } - auto handle_time_interval_expr_if_necessary = [&](const Analyzer::Expr* bound_expr, - SqlWindowFrameBoundType bound_type, - bool for_start_bound) { - if (bound_expr && bound_expr->get_type_info().is_timeinterval()) { - const auto bound_bin_oper = dynamic_cast(bound_expr); - CHECK(bound_bin_oper->get_optype() == kMULTIPLY); - auto translated_expr = translateIntervalExprForWindowFraming( - order_keys.front(), - bound_type == SqlWindowFrameBoundType::EXPR_PRECEDING, - bound_bin_oper); - if (for_start_bound) { - frame_start_bound_expr = translated_expr; - } else { - frame_end_bound_expr = translated_expr; - } - } - }; - handle_time_interval_expr_if_necessary( - frame_start_bound_expr.get(), frame_start_bound_type, true); - handle_time_interval_expr_if_necessary( - frame_end_bound_expr.get(), 
frame_end_bound_type, false); + if (frame_start_bound_expr && + frame_start_bound_expr->get_type_info().is_timeinterval()) { + frame_start_bound_expr = translateIntervalExprForWindowFraming( + order_keys.front(), + frame_start_bound_type == SqlWindowFrameBoundType::EXPR_PRECEDING, + frame_start_bound_expr.get()); + } + + if (frame_end_bound_expr && frame_end_bound_expr->get_type_info().is_timeinterval()) { + frame_end_bound_expr = translateIntervalExprForWindowFraming( + order_keys.front(), + frame_end_bound_type == SqlWindowFrameBoundType::EXPR_PRECEDING, + frame_end_bound_expr.get()); + } } if (frame_start_bound.following) { @@ -2284,25 +2364,28 @@ std::shared_ptr RelAlgTranslator::translateWindowFunction( "Window framing starting from following row cannot have preceding rows."); } } + if (frame_start_bound.is_current_row && frame_end_bound.preceding && !frame_end_bound.unbounded && has_end_bound_frame_expr) { throw std::runtime_error( "Window framing starting from current row cannot have preceding rows."); } + + if (!frame_start_bound_expr && + frame_start_bound_type == SqlWindowFrameBoundType::UNBOUNDED_PRECEDING && + !frame_end_bound_expr && + frame_end_bound_type == SqlWindowFrameBoundType::CURRENT_ROW) { + has_framing_clause = false; + VLOG(1) << "Ignore range framing mode with a frame bound between " + "UNBOUNDED_PRECEDING and CURRENT_ROW"; + } + if (has_framing_clause) { if (frame_mode == Analyzer::WindowFunction::FrameBoundType::RANGE) { if (order_keys.size() != 1) { throw std::runtime_error( "Window framing with range mode requires a single order-by column"); } - if (!frame_start_bound_expr && - frame_start_bound_type == SqlWindowFrameBoundType::UNBOUNDED_PRECEDING && - !frame_end_bound_expr && - frame_end_bound_type == SqlWindowFrameBoundType::CURRENT_ROW) { - has_framing_clause = false; - VLOG(1) << "Ignore range framing mode with a frame bound between " - "UNBOUNDED_PRECEDING and CURRENT_ROW"; - } std::set colvar_set(Analyzer::ColumnVar::colvar_comp); 
@@ -2317,11 +2400,19 @@ std::shared_ptr RelAlgTranslator::translateWindowFunction( } } } - auto const func_name = ::toString(window_func_kind); + + std::string const func_name = toString(window_func_kind); auto const num_args = args.size(); bool need_order_by_clause = false; bool need_frame_def = false; switch (window_func_kind) { + case SqlWindowFunctionKind::COUNT: { + if (has_framing_clause && args.empty()) { + args.push_back( + makeExpr(g_bigint_count ? kBIGINT : kINT, true)); + } + break; + } case SqlWindowFunctionKind::LEAD_IN_FRAME: case SqlWindowFunctionKind::LAG_IN_FRAME: { need_order_by_clause = true; @@ -2402,9 +2493,6 @@ std::shared_ptr RelAlgTranslator::translateWindowFunction( if (num_args != 2) { throw std::runtime_error(func_name + " has an invalid number of input arguments"); } - // NTH_VALUE(_IN_FRAME) may return null value even if the argument is non-null - // column - ti.set_notnull(false); if (window_func_kind == SqlWindowFunctionKind::NTH_VALUE_IN_FRAME) { need_order_by_clause = true; need_frame_def = true; @@ -2435,12 +2523,11 @@ std::shared_ptr RelAlgTranslator::translateWindowFunction( case SqlWindowFunctionKind::CONDITIONAL_CHANGE_EVENT: if (order_keys.empty()) { throw std::runtime_error( - ::toString(window_func_kind) + - " requires an ORDER BY sub-clause within the window clause"); + func_name + " requires an ORDER BY sub-clause within the window clause"); } if (has_framing_clause) { LOG(INFO) - << ::toString(window_func_kind) + << window_func_kind << " must use a pre-defined window frame range (e.g., ROWS BETWEEN " "UNBOUNDED PRECEDING AND CURRENT ROW). 
" "Thus, we skip the user-defined window frame for this window function"; @@ -2452,22 +2539,22 @@ std::shared_ptr RelAlgTranslator::translateWindowFunction( break; default:; } + if (need_order_by_clause && order_keys.empty()) { throw std::runtime_error(func_name + " requires an ORDER BY clause"); } + if (need_frame_def && !has_framing_clause) { throw std::runtime_error(func_name + " requires window frame definition"); } + if (!has_framing_clause) { frame_start_bound_type = SqlWindowFrameBoundType::UNKNOWN; frame_end_bound_type = SqlWindowFrameBoundType::UNKNOWN; frame_start_bound_expr = nullptr; frame_end_bound_expr = nullptr; } - if (window_func_kind == SqlWindowFunctionKind::COUNT && has_framing_clause && - args.empty()) { - args.push_back(makeExpr(g_bigint_count ? kBIGINT : kINT, true)); - } + return makeExpr( ti, rex_window_function->getKind(), @@ -2483,10 +2570,13 @@ std::shared_ptr RelAlgTranslator::translateWindowFunction( std::shared_ptr RelAlgTranslator::translateIntervalExprForWindowFraming( std::shared_ptr order_key, bool for_preceding_bound, - const Analyzer::BinOper* frame_bound_expr) const { + const Analyzer::Expr* expr) const { // translate time interval expression and prepare appropriate frame bound expression: // a) manually compute time unit datum: time type // b) use dateadd expression: date and timestamp + const auto frame_bound_expr = dynamic_cast(expr); + CHECK(frame_bound_expr); + CHECK_EQ(frame_bound_expr->get_optype(), kMULTIPLY); const auto order_key_ti = order_key->get_type_info(); const auto frame_bound_ti = frame_bound_expr->get_type_info(); const auto time_val_expr = diff --git a/QueryEngine/RelAlgTranslator.h b/QueryEngine/RelAlgTranslator.h index 86f222c04b..706c41a99b 100644 --- a/QueryEngine/RelAlgTranslator.h +++ b/QueryEngine/RelAlgTranslator.h @@ -161,10 +161,13 @@ class RelAlgTranslator { std::shared_ptr translateWindowFunction( const RexWindowFunctionOperator*) const; + std::tuple> translateFrameBoundExpr( + const RexScalar* 
bound_expr) const; + std::shared_ptr translateIntervalExprForWindowFraming( std::shared_ptr order_key, bool for_preceding_bound, - const Analyzer::BinOper* frame_bound_expr) const; + const Analyzer::Expr* expr) const; Analyzer::ExpressionPtrVector translateFunctionArgs(const RexFunctionOperator*) const; diff --git a/QueryEngine/RelAlgTranslatorGeo.cpp b/QueryEngine/RelAlgTranslatorGeo.cpp index 22a449e696..74f8a4cdc6 100644 --- a/QueryEngine/RelAlgTranslatorGeo.cpp +++ b/QueryEngine/RelAlgTranslatorGeo.cpp @@ -548,24 +548,10 @@ std::vector> RelAlgTranslator::translateGeoFunct ": second argument is expected to be a literal"); } const auto e = translateLiteral(rex_literal); - auto ce = std::dynamic_pointer_cast(e); - if (!ce || !e->get_type_info().is_integer()) { + if (!e || + !shared::is_any(e->get_type_info().get_type())) { throw QueryNotSupported(rex_function->getName() + - ": expecting integer index as second argument"); - } - int32_t index = 0; - if (e->get_type_info().get_type() == kSMALLINT) { - index = static_cast(ce->get_constval().smallintval); - } else if (e->get_type_info().get_type() == kTINYINT) { - index = static_cast(ce->get_constval().tinyintval); - } else if (e->get_type_info().get_type() == kINT) { - index = static_cast(ce->get_constval().intval); - } else { - throw QueryNotSupported(rex_function->getName() + " expecting integer index"); - } - if (index == 0) { - // maybe we will just return NULL here? 
- throw QueryNotSupported(rex_function->getName() + ": invalid index"); + " expecting integer index as second argument"); } arg0.push_back(e); auto oper_ti = @@ -1181,8 +1167,10 @@ std::shared_ptr RelAlgTranslator::translateUnaryGeoFunction( /*is_projection=*/false, /*use_geo_expressions=*/true); CHECK_EQ(geoargs.size(), size_t(1)); + auto expr_ti = rex_function->getType(); + expr_ti.set_notnull(arg_ti.get_notnull()); return makeExpr( - rex_function->getType(), + expr_ti, rex_function->getName(), std::vector>{geoargs.front()}); } else if (func_resolve(rex_function->getName(), "ST_Perimeter"sv, "ST_Area"sv)) { diff --git a/QueryEngine/ResultSetIteration.cpp b/QueryEngine/ResultSetIteration.cpp index 05d89b0d42..8aa64aa9fe 100644 --- a/QueryEngine/ResultSetIteration.cpp +++ b/QueryEngine/ResultSetIteration.cpp @@ -2377,8 +2377,11 @@ TargetValue ResultSet::getTargetValueFromBufferRowwise( const auto bitmap_byte_sz = count_distinct_desc.sub_bitmap_count == 1 ? count_distinct_desc.bitmapSizeBytes() : count_distinct_desc.bitmapPaddedSizeBytes(); - auto count_distinct_buffer = row_set_mem_owner_->allocateCountDistinctBuffer( - bitmap_byte_sz, /*thread_idx=*/0); + constexpr size_t thread_idx{0}; + row_set_mem_owner_->initCountDistinctBufferAllocator(bitmap_byte_sz, + thread_idx); + auto count_distinct_buffer = + row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx); *count_distinct_ptr_ptr = reinterpret_cast(count_distinct_buffer); } } diff --git a/QueryEngine/ResultSetRecyclerHolder.h b/QueryEngine/ResultSetRecyclerHolder.h index f94e3fd0de..e8a9d53035 100644 --- a/QueryEngine/ResultSetRecyclerHolder.h +++ b/QueryEngine/ResultSetRecyclerHolder.h @@ -32,18 +32,21 @@ class ResultSetRecyclerHolder { static auto markCachedItemAsDirty(size_t table_key) { CHECK(query_resultset_cache_); CHECK(chunk_metadata_cache_); - auto candidate_table_keys = + auto resultset_cache_tbl_key = query_resultset_cache_->getMappedQueryPlanDagsWithTableKey(table_key); - if 
(candidate_table_keys.has_value()) { + if (resultset_cache_tbl_key.has_value()) { query_resultset_cache_->markCachedItemAsDirty( table_key, - *candidate_table_keys, + *resultset_cache_tbl_key, CacheItemType::QUERY_RESULTSET, DataRecyclerUtil::CPU_DEVICE_IDENTIFIER); - + } + auto chunk_metadata_cache_tbl_key = + chunk_metadata_cache_->getMappedQueryPlanDagsWithTableKey(table_key); + if (chunk_metadata_cache_tbl_key.has_value()) { chunk_metadata_cache_->markCachedItemAsDirty( table_key, - *candidate_table_keys, + *chunk_metadata_cache_tbl_key, CacheItemType::CHUNK_METADATA, DataRecyclerUtil::CPU_DEVICE_IDENTIFIER); } diff --git a/QueryEngine/ResultSetReduction.cpp b/QueryEngine/ResultSetReduction.cpp index fae671ec10..a2799ff5f7 100644 --- a/QueryEngine/ResultSetReduction.cpp +++ b/QueryEngine/ResultSetReduction.cpp @@ -164,10 +164,9 @@ void run_reduction_code(const size_t executor_id, err = ret.int_val; } if (err) { - if (err == Executor::ERR_SINGLE_VALUE_FOUND_MULTIPLE_VALUES) { + if (err == int32_t(heavyai::ErrorCode::SINGLE_VALUE_FOUND_MULTIPLE_VALUES)) { throw std::runtime_error("Multiple distinct values encountered"); - } - if (err == Executor::ERR_INTERRUPTED) { + } else if (err == int32_t(heavyai::ErrorCode::INTERRUPTED)) { throw std::runtime_error( "Query execution has interrupted during result set reduction"); } diff --git a/QueryEngine/ResultSetReductionJIT.cpp b/QueryEngine/ResultSetReductionJIT.cpp index 3f030869b9..2219e2c1aa 100644 --- a/QueryEngine/ResultSetReductionJIT.cpp +++ b/QueryEngine/ResultSetReductionJIT.cpp @@ -463,8 +463,11 @@ extern "C" RUNTIME_EXPORT void approx_quantile_jit_rt(const int64_t new_set_hand auto* incoming = reinterpret_cast(new_set_handle); if (incoming->centroids().capacity()) { auto* accumulator = reinterpret_cast(old_set_handle); - accumulator->allocate(); - accumulator->mergeTDigest(*incoming); + if (accumulator->centroids().capacity() == 0u) { + *accumulator = std::move(*incoming); + } else { + 
accumulator->mergeTDigest(*incoming); + } } } diff --git a/QueryEngine/RuntimeFunctions.cpp b/QueryEngine/RuntimeFunctions.cpp index af83539216..077011d967 100644 --- a/QueryEngine/RuntimeFunctions.cpp +++ b/QueryEngine/RuntimeFunctions.cpp @@ -29,6 +29,7 @@ #include "Utils/SegmentTreeUtils.h" #include +#include #include #include #include @@ -913,8 +914,8 @@ inline AGG_TYPE compute_window_func_via_aggregation_tree( if (!aggregated_tree_for_partition || query_range_start_idx > query_range_end_idx) { \ return null_val; \ } \ - switch (agg_type) { \ - case 1: { \ + switch (static_cast(agg_type)) { \ + case AggFuncType::MIN: \ return compute_window_func_via_aggregation_tree( \ aggregated_tree_for_partition, \ query_range_start_idx, \ @@ -924,8 +925,7 @@ inline AGG_TYPE compute_window_func_via_aggregation_tree( std::numeric_limits::max(), \ invalid_val, \ null_val); \ - } \ - case 2: { \ + case AggFuncType::MAX: \ return compute_window_func_via_aggregation_tree( \ aggregated_tree_for_partition, \ query_range_start_idx, \ @@ -935,8 +935,7 @@ inline AGG_TYPE compute_window_func_via_aggregation_tree( std::numeric_limits::lowest(), \ invalid_val, \ null_val); \ - } \ - default: { \ + default: \ return compute_window_func_via_aggregation_tree( \ aggregated_tree_for_partition, \ query_range_start_idx, \ @@ -946,7 +945,6 @@ inline AGG_TYPE compute_window_func_via_aggregation_tree( static_cast(0), \ invalid_val, \ null_val); \ - } \ } \ } @@ -1858,6 +1856,7 @@ extern "C" RUNTIME_EXPORT ALWAYS_INLINE void record_error_code(const int32_t err } } +// error_codes points to an array on GPU, but a single value on CPU. extern "C" RUNTIME_EXPORT ALWAYS_INLINE int32_t get_error_code(int32_t* error_codes) { return error_codes[pos_start_impl(nullptr)]; } @@ -2384,7 +2383,7 @@ extern "C" RUNTIME_EXPORT NEVER_INLINE void linear_probabilistic_count( // First 3 parameters are output, the rest are input. 
extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub_hoisted_literals( - int32_t* error_code, + int32_t* error_codes, int32_t* total_matched, int64_t** out, const uint32_t frag_idx, @@ -2398,7 +2397,7 @@ extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub_hoisted_literals( const int64_t* join_hash_tables, const int8_t* row_func_mgr) { #ifndef _WIN32 - assert(error_code || total_matched || out || frag_idx || row_index_resume || + assert(error_codes || total_matched || out || frag_idx || row_index_resume || col_buffers || literals || num_rows || frag_row_offsets || max_matched || init_agg_value || join_hash_tables || row_func_mgr); #endif @@ -2406,7 +2405,7 @@ extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub_hoisted_literals( // First 3 parameters are output, the rest are input. extern "C" RUNTIME_EXPORT void multifrag_query_hoisted_literals( - int32_t* error_code, + int32_t* error_codes, int32_t* total_matched, int64_t** out, const uint32_t* num_fragments_ptr, @@ -2423,8 +2422,10 @@ extern "C" RUNTIME_EXPORT void multifrag_query_hoisted_literals( uint32_t const num_fragments = *num_fragments_ptr; uint32_t const num_tables = *num_tables_ptr; // num_fragments_ptr and num_tables_ptr are replaced by frag_idx when passed below. - for (uint32_t frag_idx = 0; frag_idx < num_fragments; ++frag_idx) { - query_stub_hoisted_literals(error_code, + for (uint32_t frag_idx = 0; + frag_idx < num_fragments && get_error_code(error_codes) == 0; + ++frag_idx) { + query_stub_hoisted_literals(error_codes, total_matched, out, frag_idx, @@ -2441,7 +2442,7 @@ extern "C" RUNTIME_EXPORT void multifrag_query_hoisted_literals( } // First 3 parameters are output, the rest are input. 
-extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub(int32_t* error_code, +extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub(int32_t* error_codes, int32_t* total_matched, int64_t** out, const uint32_t frag_idx, @@ -2454,14 +2455,14 @@ extern "C" RUNTIME_EXPORT NEVER_INLINE void query_stub(int32_t* error_code, const int64_t* join_hash_tables, const int8_t* row_func_mgr) { #ifndef _WIN32 - assert(error_code || total_matched || out || frag_idx || row_index_resume || + assert(error_codes || total_matched || out || frag_idx || row_index_resume || col_buffers || num_rows || frag_row_offsets || max_matched || init_agg_value || join_hash_tables || row_func_mgr); #endif } // First 3 parameters are output, the rest are input. -extern "C" RUNTIME_EXPORT void multifrag_query(int32_t* error_code, +extern "C" RUNTIME_EXPORT void multifrag_query(int32_t* error_codes, int32_t* total_matched, int64_t** out, const uint32_t* num_fragments_ptr, @@ -2477,8 +2478,10 @@ extern "C" RUNTIME_EXPORT void multifrag_query(int32_t* error_code, uint32_t const num_fragments = *num_fragments_ptr; uint32_t const num_tables = *num_tables_ptr; // num_fragments_ptr and num_tables_ptr are replaced by frag_idx when passed below. - for (uint32_t frag_idx = 0; frag_idx < num_fragments; ++frag_idx) { - query_stub(error_code, + for (uint32_t frag_idx = 0; + frag_idx < num_fragments && get_error_code(error_codes) == 0; + ++frag_idx) { + query_stub(error_codes, total_matched, out, frag_idx, @@ -2493,6 +2496,23 @@ extern "C" RUNTIME_EXPORT void multifrag_query(int32_t* error_code, } } +// WARNING: Don't add #include "Shared/InlineNullValues.h" to this file. 
+// It may build fine, but during runtime results in +// CUDA_ERROR_INVALID_PTX (218): a PTX JIT compilation failed: ptxas application ptx +// input, line 10; fatal : Parsing error near '.globl': syntax error + +// See spatial_type::Codegen::pointIsNullFunctionName() for selecting +// which of the following two functions to use to determine point IS NULL. +extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE bool point_int32_is_null(int32_t* point) { + constexpr uint32_t null_array_compressed_32 = 0x80000000U; // Shared/InlineNullValues.h + return point == nullptr || uint32_t(*point) == null_array_compressed_32; +} + +extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE bool point_double_is_null(double* point) { + constexpr double null_array_double = 2 * DBL_MIN; // Shared/InlineNullValues.h + return point == nullptr || *point == null_array_double; +} + extern "C" RUNTIME_EXPORT ALWAYS_INLINE DEVICE bool check_interrupt() { if (check_interrupt_init(static_cast(INT_CHECK))) { return true; diff --git a/QueryEngine/StringOpsIR.cpp b/QueryEngine/StringOpsIR.cpp index a58b4a19b9..20bc465559 100644 --- a/QueryEngine/StringOpsIR.cpp +++ b/QueryEngine/StringOpsIR.cpp @@ -631,13 +631,14 @@ llvm::Value* CodeGenerator::codegen(const Analyzer::LikeExpr* expr, auto like_expr_arg_lvs = codegen(expr->get_like_expr(), true, co); CHECK_EQ(size_t(3), like_expr_arg_lvs.size()); const bool is_nullable{!expr->get_arg()->get_type_info().get_notnull()}; - std::vector str_like_args{ - str_lv[1], str_lv[2], like_expr_arg_lvs[1], like_expr_arg_lvs[2]}; + std::vector str_like_args{str_lv[1], + str_lv[2], + like_expr_arg_lvs[1], + like_expr_arg_lvs[2], + cgen_state_->llInt(int8_t(escape_char))}; std::string fn_name{expr->get_is_ilike() ? 
"string_ilike" : "string_like"}; if (expr->get_is_simple()) { fn_name += "_simple"; - } else { - str_like_args.push_back(cgen_state_->llInt(int8_t(escape_char))); } if (is_nullable) { fn_name += "_nullable"; @@ -714,12 +715,14 @@ llvm::Value* CodeGenerator::codegenDictLike( CHECK_EQ(kENCODING_NONE, pattern_ti.get_compression()); const auto& pattern_datum = pattern->get_constval(); const auto& pattern_str = *pattern_datum.stringval; - const auto matching_ids = sdp->getLike(pattern_str, ilike, is_simple, escape_char); - // InIntegerSet requires 64-bit values - std::vector matching_ids_64(matching_ids.size()); - std::copy(matching_ids.begin(), matching_ids.end(), matching_ids_64.begin()); + auto work_timer = timer_start(); + const auto matching_ids = + sdp->getLike(pattern_str, ilike, is_simple, escape_char); + auto const work_ms = timer_stop(work_timer); + VLOG(3) << "Processing like operator with the pattern " << pattern_str << " took " + << work_ms << " ms (# matching elems: " << matching_ids.size() << ")"; const auto in_values = std::make_shared( - dict_like_arg, matching_ids_64, dict_like_arg_ti.get_notnull()); + dict_like_arg, matching_ids, dict_like_arg_ti.get_notnull()); return codegen(in_values.get(), co); } diff --git a/QueryEngine/TableFunctions/SystemFunctions/os/ML/MLModel.h b/QueryEngine/TableFunctions/SystemFunctions/os/ML/MLModel.h index 7c3257a216..a99dfc3141 100644 --- a/QueryEngine/TableFunctions/SystemFunctions/os/ML/MLModel.h +++ b/QueryEngine/TableFunctions/SystemFunctions/os/ML/MLModel.h @@ -30,6 +30,7 @@ #ifdef HAVE_ONEDAL #include "daal.h" +#include "oneapi/dal/algo/decision_forest.hpp" #endif class MLModelMap { @@ -148,11 +149,28 @@ class LinearRegressionModel : public AbstractMLModel { std::vector coefs_; }; +// In scenarios where oneDAL is not available, users still need a full definition of +// AbstractTreeModel to compile. 
+class TreeModelVisitor; + +class AbstractTreeModel : public virtual AbstractMLModel { + public: + virtual MLModelType getModelType() const = 0; + virtual std::string getModelTypeString() const = 0; + virtual int64_t getNumFeatures() const = 0; + virtual int64_t getNumTrees() const = 0; + virtual ~AbstractTreeModel() = default; + virtual void traverseDF(const int64_t tree_idx, + TreeModelVisitor& tree_node_visitor) const = 0; +}; + #ifdef HAVE_ONEDAL using namespace daal::algorithms; using namespace daal::data_management; +namespace df = oneapi::dal::decision_forest; + class TreeModelVisitor : public daal::algorithms::regression::TreeNodeVisitor { public: TreeModelVisitor(std::vector& decision_table) @@ -188,23 +206,38 @@ class TreeModelVisitor : public daal::algorithms::regression::TreeNodeVisitor { return true; } + bool operator()(const df::leaf_node_info& info) { + decision_table_.emplace_back(DecisionTreeEntry(info.get_response())); + if (last_node_leaf_) { + decision_table_[parent_nodes_.top()].right_child_row_idx = + static_cast(decision_table_.size() - 1); + parent_nodes_.pop(); + } + last_node_leaf_ = true; + return true; + } + + bool operator()(const df::split_node_info& info) { + decision_table_.emplace_back( + DecisionTreeEntry(info.get_feature_value(), + static_cast(info.get_feature_index()), + static_cast(decision_table_.size() + 1))); + if (last_node_leaf_) { + decision_table_[parent_nodes_.top()].right_child_row_idx = + static_cast(decision_table_.size() - 1); + parent_nodes_.pop(); + } + last_node_leaf_ = false; + parent_nodes_.emplace(decision_table_.size() - 1); + return true; + } + private: std::vector& decision_table_; std::stack parent_nodes_; bool last_node_leaf_{false}; }; -class AbstractTreeModel : public virtual AbstractMLModel { - public: - virtual MLModelType getModelType() const = 0; - virtual std::string getModelTypeString() const = 0; - virtual int64_t getNumFeatures() const = 0; - virtual int64_t getNumTrees() const = 0; - virtual 
void traverseDF(const int64_t tree_idx, - TreeModelVisitor& tree_node_visitor) const = 0; - virtual ~AbstractTreeModel() = default; -}; - class DecisionTreeRegressionModel : public virtual AbstractTreeModel { public: DecisionTreeRegressionModel(decision_tree::regression::interface1::ModelPtr& model_ptr, @@ -272,7 +305,13 @@ class GbtRegressionModel : public virtual AbstractTreeModel { gbt::regression::interface1::ModelPtr model_ptr_; }; -class RandomForestRegressionModel : public virtual AbstractTreeModel { +class AbstractRandomForestModel : public virtual AbstractTreeModel { + public: + virtual const std::vector& getVariableImportanceScores() const = 0; + virtual const double getOutOfBagError() const = 0; +}; + +class RandomForestRegressionModel : public virtual AbstractRandomForestModel { public: RandomForestRegressionModel( decision_forest::regression::interface1::ModelPtr& model_ptr, @@ -311,20 +350,78 @@ class RandomForestRegressionModel : public virtual AbstractTreeModel { model_ptr_->traverseDF(tree_idx, tree_node_visitor); } + virtual const std::vector& getVariableImportanceScores() const override { + return variable_importance_; + } + + virtual const double getOutOfBagError() const override { return out_of_bag_error_; } + const decision_forest::regression::interface1::ModelPtr getModelPtr() const { return model_ptr_; } - const std::vector& getVariableImportanceScores() const { + private: + decision_forest::regression::interface1::ModelPtr model_ptr_; + std::vector variable_importance_; + double out_of_bag_error_; +}; + +class OneAPIRandomForestRegressionModel : public virtual AbstractRandomForestModel { + public: + OneAPIRandomForestRegressionModel( + const std::shared_ptr> model, + const std::string& model_metadata, + const std::vector& variable_importance, + const double out_of_bag_error, + const int64_t num_features) + : AbstractMLModel(model_metadata) + , model_(std::move(model)) + , variable_importance_(variable_importance) + , 
out_of_bag_error_(out_of_bag_error) + , num_features_(num_features) {} + + OneAPIRandomForestRegressionModel( + const std::shared_ptr> model, + const std::string& model_metadata, + const std::vector>& cat_feature_keys, + const std::vector& variable_importance, + const double out_of_bag_error, + const int64_t num_features) + : AbstractMLModel(model_metadata, cat_feature_keys) + , model_(std::move(model)) + , variable_importance_(variable_importance) + , out_of_bag_error_(out_of_bag_error) + , num_features_(num_features) {} + + virtual MLModelType getModelType() const override { + return MLModelType::RANDOM_FOREST_REG; + } + + virtual std::string getModelTypeString() const override { + return "Random Forest Regression"; + } + virtual int64_t getNumFeatures() const override { return num_features_; } + virtual int64_t getNumTrees() const override { return model_->get_tree_count(); } + virtual void traverseDF(const int64_t tree_idx, + TreeModelVisitor& tree_node_visitor) const override { + model_->traverse_depth_first(tree_idx, tree_node_visitor); + } + + virtual const std::vector& getVariableImportanceScores() const override { return variable_importance_; } - const double getOutOfBagError() const { return out_of_bag_error_; } + virtual const double getOutOfBagError() const override { return out_of_bag_error_; } + + const std::shared_ptr> getModel() const { + return model_; + } private: - decision_forest::regression::interface1::ModelPtr model_ptr_; + const std::shared_ptr> model_; std::vector variable_importance_; double out_of_bag_error_; + int64_t num_features_; // oneapi::df::models do not store number of features }; #endif // #ifdef HAVE_ONEDAL diff --git a/QueryEngine/TableFunctions/SystemFunctions/os/ML/MLTableFunctionsCommon.h b/QueryEngine/TableFunctions/SystemFunctions/os/ML/MLTableFunctionsCommon.h index 23cbce6650..e8d94adcd8 100644 --- a/QueryEngine/TableFunctions/SystemFunctions/os/ML/MLTableFunctionsCommon.h +++ 
b/QueryEngine/TableFunctions/SystemFunctions/os/ML/MLTableFunctionsCommon.h @@ -20,13 +20,14 @@ #include -enum class MLFramework { DEFAULT, ONEDAL, MLPACK, INVALID }; +enum class MLFramework { DEFAULT, ONEDAL, ONEAPI, MLPACK, INVALID }; inline MLFramework get_ml_framework(const std::string& ml_framework_str) { const auto upper_ml_framework_str = to_upper(ml_framework_str); const static std::map ml_framework_map = { {"DEFAULT", MLFramework::DEFAULT}, {"ONEDAL", MLFramework::ONEDAL}, + {"ONEAPI", MLFramework::ONEAPI}, {"MLPACK", MLFramework::MLPACK}}; const auto itr = ml_framework_map.find(upper_ml_framework_str); if (itr == ml_framework_map.end()) { diff --git a/QueryEngine/TableFunctions/SystemFunctions/os/ML/OneAPIFunctions.hpp b/QueryEngine/TableFunctions/SystemFunctions/os/ML/OneAPIFunctions.hpp new file mode 100644 index 0000000000..e63a9b9501 --- /dev/null +++ b/QueryEngine/TableFunctions/SystemFunctions/os/ML/OneAPIFunctions.hpp @@ -0,0 +1,446 @@ +/* + * Copyright 2023 HEAVY.AI, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifndef __CUDACC__ +#ifdef HAVE_ONEDAL + +#include + +#include "MLModel.h" +#include "QueryEngine/TableFunctions/SystemFunctions/os/ML/MLTableFunctionsCommon.h" +#include "QueryEngine/heavydbTypes.h" + +#include "oneapi/dal/algo/dbscan.hpp" +#include "oneapi/dal/algo/decision_forest.hpp" +#include "oneapi/dal/algo/kmeans.hpp" +#include "oneapi/dal/algo/kmeans_init.hpp" +#include "oneapi/dal/algo/linear_regression.hpp" +#include "oneapi/dal/algo/pca.hpp" +#include "oneapi/dal/array.hpp" +#include "oneapi/dal/io/csv.hpp" +#include "oneapi/dal/table/common.hpp" +#include "oneapi/dal/table/homogen.hpp" +#include "oneapi/dal/table/row_accessor.hpp" + +#include + +namespace dal = oneapi::dal; + +inline std::ostream& operator<<(std::ostream& stream, const dal::table& table) { + if (!table.has_data()) + return stream; + + auto arr = dal::row_accessor(table).pull(); + const auto x = arr.get_data(); + const std::int32_t precision = + dal::detail::is_floating_point(table.get_metadata().get_data_type(0)) ? 3 : 0; + + if (table.get_row_count() <= 10) { + for (std::int64_t i = 0; i < table.get_row_count(); i++) { + for (std::int64_t j = 0; j < table.get_column_count(); j++) { + stream << std::setw(10) << std::setiosflags(std::ios::fixed) + << std::setprecision(precision) << x[i * table.get_column_count() + j]; + } + stream << std::endl; + } + } else { + for (std::int64_t i = 0; i < 5; i++) { + for (std::int64_t j = 0; j < table.get_column_count(); j++) { + stream << std::setw(10) << std::setiosflags(std::ios::fixed) + << std::setprecision(precision) << x[i * table.get_column_count() + j]; + } + stream << std::endl; + } + stream << "..." << (table.get_row_count() - 10) << " lines skipped..." 
<< std::endl; + for (std::int64_t i = table.get_row_count() - 5; i < table.get_row_count(); i++) { + for (std::int64_t j = 0; j < table.get_column_count(); j++) { + stream << std::setw(10) << std::setiosflags(std::ios::fixed) + << std::setprecision(precision) << x[i * table.get_column_count() + j]; + } + stream << std::endl; + } + } + return stream; +} + +template +const dal::table prepare_oneapi_data_table(const T* data, const int64_t num_rows) { + auto data_arr = dal::array::empty(num_rows); + std::copy(data, data + num_rows, data_arr.get_mutable_data()); + const auto data_table = + dal::homogen_table::wrap(data_arr, num_rows, 1, dal::data_layout::column_major); + return data_table; +} + +template +const dal::table prepare_oneapi_data_table(const std::vector& data, + const int64_t num_rows) { + const size_t num_columns = data.size(); + auto data_arr = dal::array::empty(num_rows * num_columns); + T* raw_ptr = data_arr.get_mutable_data(); + for (size_t i = 0; i < num_columns; ++i) { + const T* column_ptr = data[i]; + for (int64_t j = 0; j < num_rows; ++j) { + raw_ptr[j * num_columns + i] = column_ptr[j]; + } + } + return dal::homogen_table::wrap(data_arr, num_rows, num_columns); +} + +template +const dal::table prepare_oneapi_pivoted_data_table(const T* data, + const int64_t num_elems) { + auto data_arr = dal::array::empty(num_elems); + std::copy(data, data + num_elems, data_arr.get_mutable_data()); + return dal::homogen_table::wrap(data_arr, 1, num_elems); +} + +template +auto init_centroids_oneapi(const KMeansInitStrategy init_type, + const int num_clusters, + const dal::table features_table) { + switch (init_type) { + case KMeansInitStrategy::DEFAULT: + case KMeansInitStrategy::DETERMINISTIC: { + const auto kmeans_init_desc = + dal::kmeans_init::descriptor() + .set_cluster_count(num_clusters); + return dal::compute(kmeans_init_desc, features_table); + } + case KMeansInitStrategy::RANDOM: { + const auto kmeans_init_desc = + dal::kmeans_init::descriptor() + 
.set_cluster_count(num_clusters); + return dal::compute(kmeans_init_desc, features_table); + } + case KMeansInitStrategy::PLUS_PLUS: { + const auto kmeans_init_desc = + dal::kmeans_init::descriptor() + .set_cluster_count(num_clusters); + return dal::compute(kmeans_init_desc, features_table); + } + default: { + throw std::runtime_error( + "Invalid Kmeans cluster centroid init type. Was expecting one of " + "DETERMINISTIC, RANDOM, PLUS_PLUS."); + } + } +} + +template +NEVER_INLINE HOST int32_t +onedal_oneapi_kmeans_impl(const std::vector& input_features, + int32_t* output_clusters, + const int64_t num_rows, + const int num_clusters, + const int num_iterations, + const KMeansInitStrategy kmeans_init_type) { + try { + const auto features_table = prepare_oneapi_data_table(input_features, num_rows); + const auto result_init = + init_centroids_oneapi(kmeans_init_type, num_clusters, features_table); + + const auto kmeans_desc = dal::kmeans::descriptor<>() + .set_cluster_count(num_clusters) + .set_max_iteration_count(num_iterations) + .set_accuracy_threshold(0.001); + const dal::kmeans::train_result result_train = + dal::train(kmeans_desc, features_table, result_init.get_centroids()); + auto arr = dal::row_accessor(result_train.get_responses()).pull(); + const auto x = arr.get_data(); + std::memcpy(output_clusters, x, num_rows * sizeof(int32_t)); + } catch (const std::exception& e) { + throw std::runtime_error(e.what()); + } + + return num_rows; +} + +template +NEVER_INLINE HOST int32_t +onedal_oneapi_dbscan_impl(const std::vector& input_features, + int32_t* output_clusters, + const int64_t num_rows, + const double epsilon, + const int32_t min_observations) { + try { + const auto features_table = prepare_oneapi_data_table(input_features, num_rows); + auto dbscan_desc = dal::dbscan::descriptor<>(epsilon, min_observations); + dbscan_desc.set_result_options(dal::dbscan::result_options::responses); + const auto result_compute = dal::compute(dbscan_desc, features_table); + + 
auto arr = dal::row_accessor(result_compute.get_responses()).pull(); + const auto x = arr.get_data(); + std::memcpy(output_clusters, x, num_rows * sizeof(int32_t)); + } catch (const std::exception& e) { + throw std::runtime_error(e.what()); + } + return num_rows; +} + +template +NEVER_INLINE HOST std::pair>, std::vector> +onedal_oneapi_pca_impl(const std::vector& input_features, + const int64_t num_rows) { + try { + // TODO: Do we want to parameterize PCA to allow using SVD other than default COV? + const auto pca_desc = + dal::pca::descriptor().set_deterministic(true); + const auto features_table = prepare_oneapi_data_table(input_features, num_rows); + + const auto result_train = dal::train(pca_desc, features_table); + + auto eigenvectors_table_asarray = + dal::row_accessor(result_train.get_eigenvectors()).pull(); + const auto eigenvectors_data = eigenvectors_table_asarray.get_data(); + const int64_t num_dims = result_train.get_eigenvectors().get_row_count(); + std::vector> eigenvectors(num_dims, std::vector(num_dims)); + for (std::int64_t i = 0; i < num_dims; i++) { + for (std::int64_t j = 0; j < num_dims; j++) { + eigenvectors[i][j] = eigenvectors_data[i * num_dims + j]; + } + } + + auto eigenvalues_table_asarray = + dal::row_accessor(result_train.get_eigenvalues()).pull(); + const auto eigenvalues_data = eigenvalues_table_asarray.get_data(); + std::vector eigenvalues(eigenvalues_data, eigenvalues_data + num_dims); + + return std::make_pair(eigenvectors, eigenvalues); + } catch (std::exception& e) { + throw std::runtime_error(e.what()); + } +} + +template +int32_t extract_model_coefs(const dal::table& coefs_table, + int64_t* coef_idxs, + double* coefs) { + const int64_t num_coefs = coefs_table.get_column_count(); + + auto coefs_table_data = dal::row_accessor(coefs_table).pull().get_data(); + for (int64_t coef_idx = 0; coef_idx < num_coefs; ++coef_idx) { + coef_idxs[coef_idx] = coef_idx; + coefs[coef_idx] = coefs_table_data[coef_idx]; + } + + return num_coefs; +} 
+ +template +NEVER_INLINE HOST int32_t +onedal_oneapi_linear_reg_fit_impl(const T* input_labels, + const std::vector& input_features, + int64_t* output_coef_idxs, + double* output_coefs, + const int64_t num_rows) { + try { + const auto labels_table = prepare_oneapi_data_table(input_labels, num_rows); + const auto features_table = prepare_oneapi_data_table(input_features, num_rows); + + const auto lr_descriptor = dal::linear_regression::descriptor<>().set_result_options( + dal::linear_regression::result_options::coefficients | + dal::linear_regression::result_options::intercept); + const auto train_result = dal::train(lr_descriptor, features_table, labels_table); + + return extract_model_coefs(train_result.get_model().get_packed_coefficients(), + output_coef_idxs, + output_coefs); + } catch (std::exception& e) { + throw std::runtime_error(e.what()); + } +} + +template +NEVER_INLINE HOST int32_t +onedal_oneapi_linear_reg_predict_impl(const std::shared_ptr& model, + const std::vector& input_features, + T* output_predictions, + const int64_t num_rows) { + CHECK(model->getModelType() == MLModelType::LINEAR_REG); + try { + if (model->getNumFeatures() != static_cast(input_features.size())) { + throw std::runtime_error( + "Number of model coefficients does not match number of input features."); + } + + const auto model_coefs = prepare_oneapi_pivoted_data_table(model->getCoefs().data(), + input_features.size() + 1); + auto lr_model = dal::linear_regression::model(); + lr_model.set_packed_coefficients(model_coefs); + + const auto features_table = prepare_oneapi_data_table(input_features, num_rows); + const auto lr_descriptor = dal::linear_regression::descriptor<>().set_result_options( + dal::linear_regression::result_options::coefficients | + dal::linear_regression::result_options::intercept); + const auto test_result = dal::infer(lr_descriptor, features_table, lr_model); + + // For some reason if we construct the dal::row_accessor separately to then copy the + // memory 
later, the underlying array's destructor gets called and its memory is + // freed, so we construct it in-place instead. + std::memcpy(output_predictions, + dal::row_accessor(test_result.get_responses()).pull().get_data(), + num_rows * sizeof(T)); + return num_rows; + } catch (std::exception& e) { + throw std::runtime_error(e.what()); + } +} + +inline dal::decision_forest::variable_importance_mode +get_oneapi_var_importance_metric_type(const VarImportanceMetric var_importance_metric) { + switch (var_importance_metric) { + case VarImportanceMetric::NONE: + return dal::decision_forest::variable_importance_mode::none; + case VarImportanceMetric::DEFAULT: + case VarImportanceMetric::MDI: + return dal::decision_forest::variable_importance_mode::mdi; + case VarImportanceMetric::MDA: + return dal::decision_forest::variable_importance_mode::mda_raw; + case VarImportanceMetric::MDA_SCALED: + return dal::decision_forest::variable_importance_mode::mda_scaled; + default: { + std::ostringstream oss; + oss << "Invalid variable importance mode type. 
" + << "Was expecting one of DEFAULT, NONE, MDI, MDA, or MDA_SCALED."; + throw std::runtime_error(oss.str()); + } + } +} + +template +NEVER_INLINE HOST void onedal_oneapi_random_forest_reg_fit_impl( + const std::string& model_name, + const T* input_labels, + const std::vector& input_features, + const std::string& model_metadata, + const std::vector>& cat_feature_keys, + const int64_t num_rows, + const int64_t num_trees, + const double obs_per_tree_fraction, + const int64_t max_tree_depth, + const int64_t features_per_node, + const double impurity_threshold, + const bool bootstrap, + const int64_t min_obs_per_leaf_node, + const int64_t min_obs_per_split_node, + const double min_weight_fraction_in_leaf_node, + const double min_impurity_decrease_in_split_node, + const int64_t max_leaf_nodes, + const VarImportanceMetric var_importance_metric) { + constexpr bool compute_out_of_bag_error{false}; + try { + const auto features_table = prepare_oneapi_data_table(input_features, num_rows); + const auto labels_table = prepare_oneapi_data_table(input_labels, num_rows); + + const auto error_metric = + compute_out_of_bag_error + ? 
dal::decision_forest::error_metric_mode::out_of_bag_error + : dal::decision_forest::error_metric_mode::none; + + const auto importance_metric = + get_oneapi_var_importance_metric_type(var_importance_metric); + + auto df_desc = + dal::decision_forest::descriptor{} + .set_tree_count(num_trees) + .set_observations_per_tree_fraction(obs_per_tree_fraction) + .set_max_tree_depth(max_tree_depth) + .set_features_per_node(features_per_node) + .set_impurity_threshold(impurity_threshold) + .set_bootstrap(bootstrap) + .set_min_observations_in_leaf_node(min_obs_per_leaf_node) + .set_min_observations_in_split_node(min_obs_per_split_node) + .set_min_weight_fraction_in_leaf_node(min_weight_fraction_in_leaf_node) + .set_min_impurity_decrease_in_split_node(min_impurity_decrease_in_split_node) + .set_max_leaf_nodes(max_leaf_nodes) + .set_error_metric_mode(error_metric) + .set_variable_importance_mode(importance_metric); + + const auto result_train = dal::train(df_desc, features_table, labels_table); + + const size_t num_features = input_features.size(); + std::vector variable_importance( + var_importance_metric != VarImportanceMetric::NONE ? 
num_features : 0); + if (var_importance_metric != VarImportanceMetric::NONE) { + auto var_importance_data = + dal::row_accessor(result_train.get_var_importance()).pull().get_data(); + for (size_t feature_idx = 0; feature_idx < num_features; ++feature_idx) { + variable_importance[feature_idx] = var_importance_data[feature_idx]; + } + } + + double out_of_bag_error{0}; + if (compute_out_of_bag_error) { + auto oob_error_data = + dal::row_accessor(result_train.get_oob_err()).pull().get_data(); + out_of_bag_error = oob_error_data[0]; + } + + auto abstract_model = std::make_shared( + std::make_shared>(result_train.get_model()), + model_metadata, + cat_feature_keys, + variable_importance, + out_of_bag_error, + num_features); + g_ml_models.addModel(model_name, abstract_model); + } catch (std::exception& e) { + throw std::runtime_error(e.what()); + } +} + +template +NEVER_INLINE HOST int32_t onedal_oneapi_random_forest_reg_predict_impl( + const std::shared_ptr& model, + const std::vector& input_features, + T* output_predictions, + const int64_t num_rows) { + CHECK(model->getModelType() == MLModelType::RANDOM_FOREST_REG); + try { + if (model->getNumFeatures() != static_cast(input_features.size())) { + throw std::runtime_error("Number of provided features does not match model."); + } + const auto features_table = prepare_oneapi_data_table(input_features, num_rows); + + // oneAPI's ::infer method expects a decision_forest::descriptor argument as input. 
+ // The descriptor seems to have no effect on how the pre-trained model is executed + // though, so we pass a dummy descriptor rather than storing the descriptor originally + // used to train the model unnecessarily + auto dummy_desc = + dal::decision_forest::descriptor{}; + + const auto result_infer = + dal::infer(dummy_desc, *(model->getModel()), features_table); + + auto result_table_data = + dal::row_accessor(result_infer.get_responses()).pull().get_data(); + std::memcpy(output_predictions, result_table_data, num_rows * sizeof(T)); + + return num_rows; + } catch (std::exception& e) { + throw std::runtime_error(e.what()); + } +} + +#endif // #ifdef HAVE_ONEDAL +#endif // #ifdef __CUDACC__ diff --git a/QueryEngine/TableFunctions/SystemFunctions/os/ML/OneDalFunctions.hpp b/QueryEngine/TableFunctions/SystemFunctions/os/ML/OneDalFunctions.hpp index e353452424..53386a6ba6 100644 --- a/QueryEngine/TableFunctions/SystemFunctions/os/ML/OneDalFunctions.hpp +++ b/QueryEngine/TableFunctions/SystemFunctions/os/ML/OneDalFunctions.hpp @@ -19,19 +19,320 @@ #ifndef __CUDACC__ #ifdef HAVE_ONEDAL +#include + #include "MLModel.h" #include "QueryEngine/TableFunctions/SystemFunctions/os/ML/MLTableFunctionsCommon.h" #include "QueryEngine/heavydbTypes.h" #include "daal.h" +#include +#include + using namespace daal::algorithms; using namespace daal::data_management; +inline void printAprioriItemsets( + daal::data_management::NumericTablePtr largeItemsetsTable, + daal::data_management::NumericTablePtr largeItemsetsSupportTable, + size_t nItemsetToPrint = 20) { + using namespace daal::data_management; + + size_t largeItemsetCount = largeItemsetsSupportTable->getNumberOfRows(); + size_t nItemsInLargeItemsets = largeItemsetsTable->getNumberOfRows(); + + BlockDescriptor block1; + largeItemsetsTable->getBlockOfRows(0, nItemsInLargeItemsets, readOnly, block1); + int* largeItemsets = block1.getBlockPtr(); + + BlockDescriptor block2; + largeItemsetsSupportTable->getBlockOfRows(0, 
largeItemsetCount, readOnly, block2); + int* largeItemsetsSupportData = block2.getBlockPtr(); + + std::vector> largeItemsetsVector; + largeItemsetsVector.resize(largeItemsetCount); + + for (size_t i = 0; i < nItemsInLargeItemsets; i++) { + largeItemsetsVector[largeItemsets[2 * i]].push_back(largeItemsets[2 * i + 1]); + } + + std::vector supportVector; + supportVector.resize(largeItemsetCount); + + for (size_t i = 0; i < largeItemsetCount; i++) { + supportVector[largeItemsetsSupportData[2 * i]] = largeItemsetsSupportData[2 * i + 1]; + } + + std::cout << std::endl << "Apriori example program results" << std::endl; + + std::cout << std::endl + << "Last " << nItemsetToPrint << " large itemsets: " << std::endl; + std::cout << std::endl + << "Itemset" + << "\t\t\tSupport" << std::endl; + + size_t iMin = (((largeItemsetCount > nItemsetToPrint) && (nItemsetToPrint != 0)) + ? largeItemsetCount - nItemsetToPrint + : 0); + for (size_t i = iMin; i < largeItemsetCount; i++) { + std::cout << "{"; + for (size_t l = 0; l < largeItemsetsVector[i].size() - 1; l++) { + std::cout << largeItemsetsVector[i][l] << ", "; + } + std::cout << largeItemsetsVector[i][largeItemsetsVector[i].size() - 1] << "}\t\t"; + + std::cout << supportVector[i] << std::endl; + } + + largeItemsetsTable->releaseBlockOfRows(block1); + largeItemsetsSupportTable->releaseBlockOfRows(block2); +} + +inline void printAprioriRules(daal::data_management::NumericTablePtr leftItemsTable, + daal::data_management::NumericTablePtr rightItemsTable, + daal::data_management::NumericTablePtr confidenceTable, + size_t nRulesToPrint = 20) { + using namespace daal::data_management; + + size_t nRules = confidenceTable->getNumberOfRows(); + size_t nLeftItems = leftItemsTable->getNumberOfRows(); + size_t nRightItems = rightItemsTable->getNumberOfRows(); + + BlockDescriptor block1; + leftItemsTable->getBlockOfRows(0, nLeftItems, readOnly, block1); + int* leftItems = block1.getBlockPtr(); + + BlockDescriptor block2; + 
rightItemsTable->getBlockOfRows(0, nRightItems, readOnly, block2); + int* rightItems = block2.getBlockPtr(); + + BlockDescriptor block3; + confidenceTable->getBlockOfRows(0, nRules, readOnly, block3); + DAAL_DATA_TYPE* confidence = block3.getBlockPtr(); + + std::vector> leftItemsVector; + leftItemsVector.resize(nRules); + + if (nRules == 0) { + std::cout << std::endl << "No association rules were found " << std::endl; + return; + } + + for (size_t i = 0; i < nLeftItems; i++) { + leftItemsVector[leftItems[2 * i]].push_back(leftItems[2 * i + 1]); + } + + std::vector> rightItemsVector; + rightItemsVector.resize(nRules); + + for (size_t i = 0; i < nRightItems; i++) { + rightItemsVector[rightItems[2 * i]].push_back(rightItems[2 * i + 1]); + } + + std::vector confidenceVector; + confidenceVector.resize(nRules); + + for (size_t i = 0; i < nRules; i++) { + confidenceVector[i] = confidence[i]; + } + + std::cout << std::endl + << "Last " << nRulesToPrint << " association rules: " << std::endl; + std::cout << std::endl + << "Rule" + << "\t\t\t\tConfidence" << std::endl; + size_t iMin = + (((nRules > nRulesToPrint) && (nRulesToPrint != 0)) ? 
(nRules - nRulesToPrint) : 0); + + for (size_t i = iMin; i < nRules; i++) { + std::cout << "{"; + for (size_t l = 0; l < leftItemsVector[i].size() - 1; l++) { + std::cout << leftItemsVector[i][l] << ", "; + } + std::cout << leftItemsVector[i][leftItemsVector[i].size() - 1] << "} => {"; + + for (size_t l = 0; l < rightItemsVector[i].size() - 1; l++) { + std::cout << rightItemsVector[i][l] << ", "; + } + std::cout << rightItemsVector[i][rightItemsVector[i].size() - 1] << "}\t\t"; + + std::cout << confidenceVector[i] << std::endl; + } + + leftItemsTable->releaseBlockOfRows(block1); + rightItemsTable->releaseBlockOfRows(block2); + confidenceTable->releaseBlockOfRows(block3); +} + +inline bool isFull(daal::data_management::NumericTableIface::StorageLayout layout) { + int layoutInt = (int)layout; + if (daal::data_management::packed_mask & layoutInt) { + return false; + } + return true; +} + +inline bool isUpper(daal::data_management::NumericTableIface::StorageLayout layout) { + using daal::data_management::NumericTableIface; + + if (layout == NumericTableIface::upperPackedSymmetricMatrix || + layout == NumericTableIface::upperPackedTriangularMatrix) { + return true; + } + return false; +} + +inline bool isLower(daal::data_management::NumericTableIface::StorageLayout layout) { + using daal::data_management::NumericTableIface; + + if (layout == NumericTableIface::lowerPackedSymmetricMatrix || + layout == NumericTableIface::lowerPackedTriangularMatrix) { + return true; + } + return false; +} + +template +inline void printArray(T* array, + const size_t nPrintedCols, + const size_t nPrintedRows, + const size_t nCols, + std::string message, + size_t interval = 10) { + std::cout << std::setiosflags(std::ios::left); + std::cout << message << std::endl; + for (size_t i = 0; i < nPrintedRows; i++) { + for (size_t j = 0; j < nPrintedCols; j++) { + std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed) + << std::setprecision(3); + std::cout << array[i * nCols + j]; + 
} + std::cout << std::endl; + } + std::cout << std::endl; +} + +template +inline void printArray(T* array, + const size_t nCols, + const size_t nRows, + std::string message, + size_t interval = 10) { + printArray(array, nCols, nRows, nCols, message, interval); +} + +template +inline void printLowerArray(T* array, + const size_t nPrintedRows, + std::string message, + size_t interval = 10) { + std::cout << std::setiosflags(std::ios::left); + std::cout << message << std::endl; + int ind = 0; + for (size_t i = 0; i < nPrintedRows; i++) { + for (size_t j = 0; j <= i; j++) { + std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed) + << std::setprecision(3); + std::cout << array[ind++]; + } + std::cout << std::endl; + } + std::cout << std::endl; +} + +template +inline void printUpperArray(T* array, + const size_t nPrintedCols, + const size_t nPrintedRows, + const size_t nCols, + std::string message, + size_t interval = 10) { + std::cout << std::setiosflags(std::ios::left); + std::cout << message << std::endl; + int ind = 0; + for (size_t i = 0; i < nPrintedRows; i++) { + for (size_t j = 0; j < i; j++) { + std::cout << " "; + } + for (size_t j = i; j < nPrintedCols; j++) { + std::cout << std::setw(interval) << std::setiosflags(std::ios::fixed) + << std::setprecision(3); + std::cout << array[ind++]; + } + for (size_t j = nPrintedCols; j < nCols; j++) { + ind++; + } + std::cout << std::endl; + } + std::cout << std::endl; +} + +inline void printNumericTable(daal::data_management::NumericTable* dataTable, + const char* message = "", + size_t nPrintedRows = 0, + size_t nPrintedCols = 0, + size_t interval = 10) { + using namespace daal::data_management; + + size_t nRows = dataTable->getNumberOfRows(); + size_t nCols = dataTable->getNumberOfColumns(); + NumericTableIface::StorageLayout layout = dataTable->getDataLayout(); + + if (nPrintedRows != 0) { + nPrintedRows = std::min(nRows, nPrintedRows); + } else { + nPrintedRows = nRows; + } + + if (nPrintedCols != 0) { 
+ nPrintedCols = std::min(nCols, nPrintedCols); + } else { + nPrintedCols = nCols; + } + + BlockDescriptor block; + if (isFull(layout) || layout == NumericTableIface::csrArray) { + dataTable->getBlockOfRows(0, nRows, readOnly, block); + printArray( + block.getBlockPtr(), nPrintedCols, nPrintedRows, nCols, message, interval); + dataTable->releaseBlockOfRows(block); + } else { + PackedArrayNumericTableIface* packedTable = + dynamic_cast(dataTable); + packedTable->getPackedArray(readOnly, block); + if (isLower(layout)) { + printLowerArray( + block.getBlockPtr(), nPrintedRows, message, interval); + } else if (isUpper(layout)) { + printUpperArray( + block.getBlockPtr(), nPrintedCols, nPrintedRows, nCols, message, interval); + } + packedTable->releasePackedArray(block); + } +} + +inline void printNumericTable(daal::data_management::NumericTable& dataTable, + const char* message = "", + size_t nPrintedRows = 0, + size_t nPrintedCols = 0, + size_t interval = 10) { + printNumericTable(&dataTable, message, nPrintedRows, nPrintedCols, interval); +} + +inline void printNumericTable(const daal::data_management::NumericTablePtr& dataTable, + const char* message = "", + size_t nPrintedRows = 0, + size_t nPrintedCols = 0, + size_t interval = 10) { + printNumericTable(dataTable.get(), message, nPrintedRows, nPrintedCols, interval); +} + template const NumericTablePtr prepare_data_table(const T* data, const int64_t num_rows) { // Prepare input data as structure of arrays (SOA) as columnar format (zero-copy) const auto data_table = SOANumericTable::create(1 /* num_columns */, num_rows); data_table->setArray(const_cast(data), 0); + return data_table; } @@ -237,6 +538,7 @@ onedal_linear_reg_fit_impl(const T* input_labels, const auto training_result = algorithm.getResult(); const auto coefs_table = training_result->get(linear_regression::training::model)->getBeta(); + return extract_model_coefs(coefs_table, output_coef_idxs, output_coefs); } catch (std::exception& e) { throw 
std::runtime_error(e.what()); @@ -256,6 +558,7 @@ NEVER_INLINE HOST linear_regression::ModelPtr build_linear_reg_model( } const auto betas_table = prepare_pivoted_data_table(casted_model_coefs.data(), num_coefs); + CHECK_EQ(betas_table->getNumberOfColumns(), num_coefs); // Create model builder with true intercept flag @@ -588,16 +891,12 @@ NEVER_INLINE HOST int32_t onedal_random_forest_reg_predict_impl( result->set(decision_forest::regression::prediction::prediction, predictions_table); algorithm.setResult(result); algorithm.compute(); + return num_rows; } catch (std::exception& e) { throw std::runtime_error(e.what()); } } -inline const std::vector& onedal_random_forest_reg_var_importance_impl( - const std::shared_ptr& rand_forest_model) { - return rand_forest_model->getVariableImportanceScores(); -} - #endif // #ifdef HAVE_ONEDAL #endif // #ifdef __CUDACC__ diff --git a/QueryEngine/TableFunctions/SystemFunctions/os/MLTableFunctions.cpp b/QueryEngine/TableFunctions/SystemFunctions/os/MLTableFunctions.cpp index 8b246a08d4..b81465c3d9 100644 --- a/QueryEngine/TableFunctions/SystemFunctions/os/MLTableFunctions.cpp +++ b/QueryEngine/TableFunctions/SystemFunctions/os/MLTableFunctions.cpp @@ -9,7 +9,7 @@ int32_t supported_ml_frameworks__cpu_(TableFunctionManager& mgr, Column& output_ml_frameworks, Column& output_availability, Column& output_default) { - const std::vector ml_frameworks = {"onedal", "mlpack"}; + const std::vector ml_frameworks = {"oneapi", "onedal", "mlpack"}; const int32_t num_frameworks = ml_frameworks.size(); mgr.set_output_row_size(num_frameworks); const std::vector ml_framework_string_ids = @@ -40,7 +40,8 @@ int32_t supported_ml_frameworks__cpu_(TableFunctionManager& mgr, for (int32_t out_row_idx = 0; out_row_idx < num_frameworks; ++out_row_idx) { output_ml_frameworks[out_row_idx] = ml_framework_string_ids[out_row_idx]; - if (ml_frameworks[out_row_idx] == "onedal") { + if (ml_frameworks[out_row_idx] == "onedal" || + ml_frameworks[out_row_idx] == 
"oneapi") { #ifdef HAVE_ONEDAL framework_found_actions(out_row_idx); #else @@ -185,12 +186,12 @@ random_forest_reg_var_importance__cpu_1(TableFunctionManager& mgr, #ifdef HAVE_ONEDAL const auto base_model = g_ml_models.getModel(model_name); const auto rand_forest_model = - std::dynamic_pointer_cast(base_model); + std::dynamic_pointer_cast(base_model); if (!rand_forest_model) { throw std::runtime_error("Model is not of type random forest."); } const auto& variable_importance_scores = - onedal_random_forest_reg_var_importance_impl(rand_forest_model); + rand_forest_model->getVariableImportanceScores(); const int64_t num_features = variable_importance_scores.size(); mgr.set_output_row_size(num_features); if (num_features == 0) { diff --git a/QueryEngine/TableFunctions/SystemFunctions/os/MLTableFunctions.hpp b/QueryEngine/TableFunctions/SystemFunctions/os/MLTableFunctions.hpp index 6edc32bf50..4bcdf63e29 100644 --- a/QueryEngine/TableFunctions/SystemFunctions/os/MLTableFunctions.hpp +++ b/QueryEngine/TableFunctions/SystemFunctions/os/MLTableFunctions.hpp @@ -26,6 +26,7 @@ #include "QueryEngine/TableFunctions/SystemFunctions/os/ML/OneHotEncoder.h" #ifdef HAVE_ONEDAL +#include "QueryEngine/TableFunctions/SystemFunctions/os/ML/OneAPIFunctions.hpp" #include "QueryEngine/TableFunctions/SystemFunctions/os/ML/OneDalFunctions.hpp" #endif @@ -112,7 +113,7 @@ kmeans__cpu_template(TableFunctionManager& mgr, output_ids = input_ids; const auto kmeans_init_strategy = get_kmeans_init_type(init_type_str); if (kmeans_init_strategy == KMeansInitStrategy::INVALID) { - return mgr.ERROR_MESSAGE("Invalid KMeans initializaiton strategy: " + + return mgr.ERROR_MESSAGE("Invalid KMeans initialization strategy: " + init_type_str.getString()); } @@ -137,8 +138,16 @@ kmeans__cpu_template(TableFunctionManager& mgr, bool did_execute = false; #ifdef HAVE_ONEDAL - if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL || + if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI 
|| preferred_ml_framework == MLFramework::DEFAULT)) { + onedal_oneapi_kmeans_impl(normalized_ptrs, + denulled_output, + num_rows, + num_clusters, + num_iterations, + kmeans_init_strategy); + did_execute = true; + } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) { onedal_kmeans_impl(normalized_ptrs, denulled_output, num_rows, @@ -224,8 +233,12 @@ dbscan__cpu_template(TableFunctionManager& mgr, bool did_execute = false; #ifdef HAVE_ONEDAL - if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL || + if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI || preferred_ml_framework == MLFramework::DEFAULT)) { + onedal_oneapi_dbscan_impl( + normalized_ptrs, denulled_output, num_rows, epsilon, min_observations); + did_execute = true; + } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) { onedal_dbscan_impl( normalized_ptrs, denulled_output, num_rows, epsilon, min_observations); did_execute = true; @@ -287,6 +300,10 @@ linear_reg_fit_impl(TableFunctionManager& mgr, try { bool did_execute = false; #ifdef HAVE_ONEDAL + // FIXME: We default to legacy DAAL Linear Regression, as the oneAPI implementation + // seems to be experimental. It crashes on a few small toy models (such as datasets + // with 1 datapoint) and finds different coefficients for large models, when compared + // with the DAAL implementation. This should be revisited when oneDAL is updated. 
if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL || preferred_ml_framework == MLFramework::DEFAULT)) { onedal_linear_reg_fit_impl(labels_ptrs[0], @@ -295,6 +312,13 @@ linear_reg_fit_impl(TableFunctionManager& mgr, coefs.data(), denulled_data.masked_num_rows); did_execute = true; + } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI)) { + onedal_oneapi_linear_reg_fit_impl(labels_ptrs[0], + features_ptrs, + coef_idxs.data(), + coefs.data(), + denulled_data.masked_num_rows); + did_execute = true; } #endif #ifdef HAVE_MLPACK @@ -1132,8 +1156,58 @@ random_forest_reg_fit_impl(TableFunctionManager& mgr, var_importance_metric_str.getString()); } #ifdef HAVE_ONEDAL - if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL || + if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI || preferred_ml_framework == MLFramework::DEFAULT)) { + if (use_histogram) { + onedal_oneapi_random_forest_reg_fit_impl< + T, + oneapi::dal::decision_forest::method::hist>( + model_name, + labels_ptrs[0], + features_ptrs, + model_metadata, + cat_feature_keys, + denulled_data.masked_num_rows, + num_trees, + obs_per_tree_fraction, + max_tree_depth, + features_per_node, + impurity_threshold, + bootstrap, + min_obs_per_leaf_node, + min_obs_per_split_node, + min_weight_fraction_in_leaf_node, + min_impurity_decrease_in_split_node, + max_leaf_nodes, + var_importance_metric); + } else { + onedal_oneapi_random_forest_reg_fit_impl< + T, + oneapi::dal::decision_forest::method::dense>( + model_name, + labels_ptrs[0], + features_ptrs, + model_metadata, + cat_feature_keys, + denulled_data.masked_num_rows, + num_trees, + obs_per_tree_fraction, + max_tree_depth, + features_per_node, + impurity_threshold, + bootstrap, + min_obs_per_leaf_node, + min_obs_per_split_node, + min_weight_fraction_in_leaf_node, + min_impurity_decrease_in_split_node, + max_leaf_nodes, + var_importance_metric); + } + const TextEncodingDict model_name_str_id = + 
output_model_name.getOrAddTransient(model_name); + output_model_name[0] = model_name_str_id; + did_execute = true; + } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) { if (use_histogram) { onedal_random_forest_reg_fit_impl( model_name, @@ -1456,8 +1530,19 @@ pca_fit_impl(TableFunctionManager& mgr, z_std_norm_summary_stats.normalized_data.size()); bool did_execute = false; #ifdef HAVE_ONEDAL - if (preferred_ml_framework == MLFramework::ONEDAL || + if (preferred_ml_framework == MLFramework::ONEAPI || preferred_ml_framework == MLFramework::DEFAULT) { + const auto [eigenvectors, eigenvalues] = + onedal_oneapi_pca_impl(normalized_ptrs, denulled_data.masked_num_rows); + auto model = std::make_shared(z_std_norm_summary_stats.means, + z_std_norm_summary_stats.std_devs, + eigenvectors, + eigenvalues, + model_metadata, + cat_feature_keys); + g_ml_models.addModel(model_name, model); + did_execute = true; + } else if (preferred_ml_framework == MLFramework::ONEDAL) { const auto [eigenvectors, eigenvalues] = onedal_pca_impl(normalized_ptrs, denulled_data.masked_num_rows); auto model = std::make_shared(z_std_norm_summary_stats.means, @@ -1607,8 +1692,12 @@ ml_reg_predict_impl(TableFunctionManager& mgr, std::dynamic_pointer_cast(model); CHECK(linear_reg_model); #ifdef HAVE_ONEDAL - if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL || + if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI || preferred_ml_framework == MLFramework::DEFAULT)) { + onedal_oneapi_linear_reg_predict_impl( + linear_reg_model, features_ptrs, denulled_output, num_rows); + did_execute = true; + } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) { onedal_linear_reg_predict_impl( linear_reg_model, features_ptrs, denulled_output, num_rows); did_execute = true; @@ -1655,11 +1744,19 @@ ml_reg_predict_impl(TableFunctionManager& mgr, #ifdef HAVE_ONEDAL const auto random_forest_reg_model = std::dynamic_pointer_cast(model); - 
CHECK(random_forest_reg_model); - if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL || + const auto oneapi_random_forest_reg_model = + std::dynamic_pointer_cast(model); + CHECK(random_forest_reg_model || oneapi_random_forest_reg_model); + if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI || + preferred_ml_framework == MLFramework::ONEDAL || preferred_ml_framework == MLFramework::DEFAULT)) { - onedal_random_forest_reg_predict_impl( - random_forest_reg_model, features_ptrs, denulled_output, num_rows); + if (random_forest_reg_model) { + onedal_random_forest_reg_predict_impl( + random_forest_reg_model, features_ptrs, denulled_output, num_rows); + } else { + onedal_oneapi_random_forest_reg_predict_impl( + oneapi_random_forest_reg_model, features_ptrs, denulled_output, num_rows); + } did_execute = true; } #endif diff --git a/QueryEngine/TableFunctions/SystemFunctions/os/Shared/TableFunctionsStats.cpp b/QueryEngine/TableFunctions/SystemFunctions/os/Shared/TableFunctionsStats.cpp index 2903f1cfd2..fef21a6963 100644 --- a/QueryEngine/TableFunctions/SystemFunctions/os/Shared/TableFunctionsStats.cpp +++ b/QueryEngine/TableFunctions/SystemFunctions/os/Shared/TableFunctionsStats.cpp @@ -35,42 +35,48 @@ NEVER_INLINE HOST ColumnStats get_column_stats( std::vector local_col_non_null_or_filtered_counts(num_threads, 0L); tbb::task_arena limited_arena(num_threads); limited_arena.execute([&] { - tbb::parallel_for(tbb::blocked_range(0, num_rows), - [&](const tbb::blocked_range& r) { - const int64_t start_idx = r.begin(); - const int64_t end_idx = r.end(); - T local_col_min = std::numeric_limits::max(); - T local_col_max = std::numeric_limits::lowest(); - double local_col_sum = 0.; - int64_t local_col_non_null_or_filtered_count = 0; - for (int64_t r = start_idx; r < end_idx; ++r) { - const T val = data[r]; - if (val == inline_null_value()) { - continue; - } - if (!predicate(val)) { - continue; - } - if (val < local_col_min) { - local_col_min = val; - } 
- if (val > local_col_max) { - local_col_max = val; - } - local_col_sum += data[r]; - local_col_non_null_or_filtered_count++; - } - size_t thread_idx = tbb::this_task_arena::current_thread_index(); - if (local_col_min < local_col_mins[thread_idx]) { - local_col_mins[thread_idx] = local_col_min; - } - if (local_col_max > local_col_maxes[thread_idx]) { - local_col_maxes[thread_idx] = local_col_max; - } - local_col_sums[thread_idx] += local_col_sum; - local_col_non_null_or_filtered_counts[thread_idx] += - local_col_non_null_or_filtered_count; - }); + tbb::parallel_for( + tbb::blocked_range(0, num_rows), + [&](const tbb::blocked_range& r) { + const int64_t start_idx = r.begin(); + const int64_t end_idx = r.end(); + T local_col_min = std::numeric_limits::max(); + T local_col_max = std::numeric_limits::lowest(); + double local_col_sum = 0.; + int64_t local_col_non_null_or_filtered_count = 0; + for (int64_t r = start_idx; r < end_idx; ++r) { + const T val = data[r]; + if constexpr (std::is_same_v || std::is_same_v) { + if (std::isnan(val) || std::isinf(val)) { + continue; + } + } + if (val == inline_null_value()) { + continue; + } + if (!predicate(val)) { + continue; + } + if (val < local_col_min) { + local_col_min = val; + } + if (val > local_col_max) { + local_col_max = val; + } + local_col_sum += data[r]; + local_col_non_null_or_filtered_count++; + } + size_t thread_idx = tbb::this_task_arena::current_thread_index(); + if (local_col_min < local_col_mins[thread_idx]) { + local_col_mins[thread_idx] = local_col_min; + } + if (local_col_max > local_col_maxes[thread_idx]) { + local_col_maxes[thread_idx] = local_col_max; + } + local_col_sums[thread_idx] += local_col_sum; + local_col_non_null_or_filtered_counts[thread_idx] += + local_col_non_null_or_filtered_count; + }); }); ColumnStats column_stats; diff --git a/QueryEngine/TableFunctions/TableFunctionExecutionContext.cpp b/QueryEngine/TableFunctions/TableFunctionExecutionContext.cpp index b74dcc171e..e4b075160f 100644 --- 
a/QueryEngine/TableFunctions/TableFunctionExecutionContext.cpp +++ b/QueryEngine/TableFunctions/TableFunctionExecutionContext.cpp @@ -131,6 +131,7 @@ ResultSetPtr TableFunctionExecutionContext::execute( // arguments are not supported on GPU atm. std::vector> col_list_bufs; std::vector> input_col_list_str_dict_proxy_ptrs; + for (const auto& input_expr : exe_unit.input_exprs) { auto ti = input_expr->get_type_info(); if (!ti.is_column_list()) { diff --git a/QueryEngine/TableOptimizer.cpp b/QueryEngine/TableOptimizer.cpp index 3bd6f7e51a..8da7e44d72 100644 --- a/QueryEngine/TableOptimizer.cpp +++ b/QueryEngine/TableOptimizer.cpp @@ -155,8 +155,8 @@ void TableOptimizer::recomputeMetadata() const { for (const auto td : table_descriptors) { ScopeGuard row_set_holder = [this] { executor_->row_set_mem_owner_ = nullptr; }; - executor_->row_set_mem_owner_ = std::make_shared( - ROW_SET_SIZE, executor_->executor_id_, /*num_threads=*/1); + executor_->row_set_mem_owner_ = + std::make_shared(ROW_SET_SIZE, executor_->executor_id_); const auto table_id = td->tableId; auto stats = recomputeDeletedColumnMetadata(td); @@ -563,8 +563,8 @@ void TableOptimizer::vacuumFragmentsAboveMinSelectivity( heavyai::unique_lock executor_lock( executor_->execute_mutex_); ScopeGuard row_set_holder = [this] { executor_->row_set_mem_owner_ = nullptr; }; - executor_->row_set_mem_owner_ = std::make_shared( - ROW_SET_SIZE, executor_->executor_id_, /*num_threads=*/1); + executor_->row_set_mem_owner_ = + std::make_shared(ROW_SET_SIZE, executor_->executor_id_); deleted_column_stats = getDeletedColumnStats(td, getFragmentIndexes(td, fragment_ids)); executor_->clearMetaInfoCache(); diff --git a/QueryEngine/ThriftSerializers.h b/QueryEngine/ThriftSerializers.h index bb1a802a3c..6bd480982d 100644 --- a/QueryEngine/ThriftSerializers.h +++ b/QueryEngine/ThriftSerializers.h @@ -29,20 +29,22 @@ #include "QueryEngine/AggregatedColRange.h" #include "QueryEngine/CompilationOptions.h" #include 
"QueryEngine/Descriptors/CountDistinctDescriptor.h" -#include "QueryEngine/Descriptors/Types.h" #include "QueryEngine/ExtensionFunctionsWhitelist.h" #include "QueryEngine/StringDictionaryGenerations.h" #include "QueryEngine/TableFunctions/TableFunctionsFactory.h" #include "QueryEngine/TargetMetaInfo.h" +#include "Shared/TargetInfo.h" #include "Shared/ThriftTypesConvert.h" +#include "enums.h" namespace ThriftSerializers { -#define THRIFT_LAYOUT_CASE(layout) \ - case QueryDescriptionType::layout: \ +#define THRIFT_LAYOUT_CASE(layout) \ + case heavyai::QueryDescriptionType::layout: \ return TResultSetLayout::layout; -inline TResultSetLayout::type layout_to_thrift(const QueryDescriptionType layout) { +inline TResultSetLayout::type layout_to_thrift( + const heavyai::QueryDescriptionType layout) { switch (layout) { THRIFT_LAYOUT_CASE(GroupByPerfectHash) THRIFT_LAYOUT_CASE(GroupByBaselineHash) @@ -58,9 +60,10 @@ inline TResultSetLayout::type layout_to_thrift(const QueryDescriptionType layout #define UNTHRIFT_LAYOUT_CASE(layout) \ case TResultSetLayout::layout: \ - return QueryDescriptionType::layout; + return heavyai::QueryDescriptionType::layout; -inline QueryDescriptionType layout_from_thrift(const TResultSetLayout::type layout) { +inline heavyai::QueryDescriptionType layout_from_thrift( + const TResultSetLayout::type layout) { switch (layout) { UNTHRIFT_LAYOUT_CASE(GroupByPerfectHash) UNTHRIFT_LAYOUT_CASE(GroupByBaselineHash) diff --git a/QueryEngine/WindowContext.cpp b/QueryEngine/WindowContext.cpp index 8e67cec952..b909fcacc0 100644 --- a/QueryEngine/WindowContext.cpp +++ b/QueryEngine/WindowContext.cpp @@ -1444,8 +1444,9 @@ void WindowFunctionContext::computePartitionBuffer( break; } default: { - throw std::runtime_error("Window function not supported yet: " + - ::toString(window_func->getKind())); + std::ostringstream oss; + oss << "Window function not supported yet: " << window_func_->getKind(); + throw std::runtime_error(oss.str()); } } } diff --git 
a/QueryEngine/WindowContext.h b/QueryEngine/WindowContext.h index f9dde393ca..dd971648c9 100644 --- a/QueryEngine/WindowContext.h +++ b/QueryEngine/WindowContext.h @@ -33,6 +33,24 @@ inline bool window_function_is_value(const SqlWindowFunctionKind kind) { case SqlWindowFunctionKind::FIRST_VALUE: case SqlWindowFunctionKind::LAST_VALUE: case SqlWindowFunctionKind::NTH_VALUE: + case SqlWindowFunctionKind::LAG_IN_FRAME: + case SqlWindowFunctionKind::LEAD_IN_FRAME: + case SqlWindowFunctionKind::FIRST_VALUE_IN_FRAME: + case SqlWindowFunctionKind::LAST_VALUE_IN_FRAME: + case SqlWindowFunctionKind::NTH_VALUE_IN_FRAME: + return true; + default: + return false; + } +} + +inline bool window_function_is_value_with_frame(const SqlWindowFunctionKind kind) { + switch (kind) { + case SqlWindowFunctionKind::LAG_IN_FRAME: + case SqlWindowFunctionKind::LEAD_IN_FRAME: + case SqlWindowFunctionKind::FIRST_VALUE_IN_FRAME: + case SqlWindowFunctionKind::LAST_VALUE_IN_FRAME: + case SqlWindowFunctionKind::NTH_VALUE_IN_FRAME: return true; default: return false; diff --git a/QueryEngine/WindowFunctionIR.cpp b/QueryEngine/WindowFunctionIR.cpp index 2f3a58d51b..dec6c123e8 100644 --- a/QueryEngine/WindowFunctionIR.cpp +++ b/QueryEngine/WindowFunctionIR.cpp @@ -182,8 +182,14 @@ llvm::Value* get_null_value_by_size(CgenState* cgen_state, SQLTypeInfo col_ti) { case kTINYINT: return cgen_state->llInt((int8_t)inline_int_null_value()); case kSMALLINT: + if (col_ti.get_compression() == kENCODING_FIXED) { + return cgen_state->llInt((int16_t)(inline_fixed_encoding_null_val(col_ti))); + } return cgen_state->llInt((int16_t)inline_int_null_value()); case kINT: + if (col_ti.get_compression() == kENCODING_FIXED) { + return cgen_state->llInt((int32_t)(inline_fixed_encoding_null_val(col_ti))); + } return cgen_state->llInt((int32_t)inline_int_null_value()); case kTIME: case kTIMESTAMP: @@ -191,6 +197,10 @@ llvm::Value* get_null_value_by_size(CgenState* cgen_state, SQLTypeInfo col_ti) { return 
cgen_state->llInt((int64_t)(inline_fixed_encoding_null_val(col_ti))); } case kBIGINT: + if (col_ti.get_compression() == kENCODING_FIXED) { + return cgen_state->llInt((int64_t)(inline_fixed_encoding_null_val(col_ti))); + } + return cgen_state->llInt((int64_t)inline_int_null_value()); case kINTERVAL_DAY_TIME: case kINTERVAL_YEAR_MONTH: case kDECIMAL: diff --git a/QueryEngine/enums.h b/QueryEngine/enums.h new file mode 100644 index 0000000000..0e607e19d3 --- /dev/null +++ b/QueryEngine/enums.h @@ -0,0 +1,65 @@ +/* + * Copyright 2023 HEAVY.AI, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file enums.h + * @brief QueryEngine enum classes with minimal #include files. 
+ */ + +#pragma once + +#include "Shared/define_enum_class.h" + +namespace heavyai { + +HEAVYAI_DEFINE_ENUM_CLASS_WITH_DESCRIPTIONS( + ErrorCode, + (NO_ERROR, "No Error"), + (DIV_BY_ZERO, "Division by zero"), + (OUT_OF_GPU_MEM, + "Query couldn't keep the entire working set of columns in GPU memory"), + (OUT_OF_SLOTS, "Out of Slots"), + (UNSUPPORTED_SELF_JOIN, "Self joins not supported yet"), + (OUT_OF_RENDER_MEM, + "Insufficient GPU memory for query results in render output buffer sized by " + "render-mem-bytes"), + (OUT_OF_CPU_MEM, "Not enough host memory to execute the query"), + (OVERFLOW_OR_UNDERFLOW, "Overflow or underflow"), + (OUT_OF_TIME, "Query execution has exceeded the time limit"), + (INTERRUPTED, "Query execution has been interrupted"), + (COLUMNAR_CONVERSION_NOT_SUPPORTED, + "Columnar conversion not supported for variable length types"), + (TOO_MANY_LITERALS, "Too many literals in the query"), + (STRING_CONST_IN_RESULTSET, + "NONE ENCODED String types are not supported as input result set."), + (STREAMING_TOP_N_NOT_SUPPORTED_IN_RENDER_QUERY, + "Streaming-Top-N not supported in Render Query"), + (SINGLE_VALUE_FOUND_MULTIPLE_VALUES, "Multiple distinct values encountered"), + (GEOS, "Geo-related error"), + (WIDTH_BUCKET_INVALID_ARGUMENT, + "Arguments of WIDTH_BUCKET function does not satisfy the condition"), + (BBOX_OVERLAPS_LIMIT_EXCEEDED, + "Maximum supported number of bounding box overlaps exceeded")) + +HEAVYAI_DEFINE_ENUM_CLASS(QueryDescriptionType, + GroupByPerfectHash, + GroupByBaselineHash, + Projection, + TableFunction, + NonGroupedAggregate, + Estimator) + +} // namespace heavyai diff --git a/QueryRunner/QueryRunner.cpp b/QueryRunner/QueryRunner.cpp index 68664f7cb6..8ddf688417 100644 --- a/QueryRunner/QueryRunner.cpp +++ b/QueryRunner/QueryRunner.cpp @@ -652,7 +652,6 @@ std::shared_ptr QueryRunner::runSQL(const std::string& query_str, return nullptr; } const auto execution_result = runSelectQuery(query_str, std::move(co), std::move(eo)); - 
VLOG(1) << session_info_->getCatalog().getDataMgr().getSystemMemoryUsage(); return execution_result->getRows(); } diff --git a/Shared/Datum.h b/Shared/Datum.h index 91c32660d5..c837c31658 100644 --- a/Shared/Datum.h +++ b/Shared/Datum.h @@ -25,6 +25,8 @@ #include "funcannotations.h" +#include + #ifndef __CUDACC__ #include #endif @@ -79,3 +81,41 @@ union Datum { std::string* stringval; // string value #endif }; + +template +Datum make_datum(T val) { + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v +#ifndef __CUDACC__ + || std::is_same_v +#endif + , + "Type T must be one of the allowed types"); + Datum d; + if constexpr (std::is_same_v) { + d.boolval = static_cast(val); + } else if constexpr (std::is_same_v) { + d.tinyintval = val; + } else if constexpr (std::is_same_v) { + d.smallintval = val; + } else if constexpr (std::is_same_v) { + d.intval = val; + } else if constexpr (std::is_same_v) { + d.bigintval = val; + } else if constexpr (std::is_same_v) { + d.floatval = val; + } else if constexpr (std::is_same_v) { + d.doubleval = val; + } else if constexpr (std::is_same_v) { + // deleting `arrayval` is caller's responsibility + d.arrayval = val; +#ifndef __CUDACC__ + } else if constexpr (std::is_same_v) { + // deleting `stringval` is caller's responsibility + d.stringval = val; +#endif + } + return d; +} diff --git a/Shared/SimpleAllocator.h b/Shared/SimpleAllocator.h index 9d0a6acb9b..cf61c1519a 100644 --- a/Shared/SimpleAllocator.h +++ b/Shared/SimpleAllocator.h @@ -17,9 +17,7 @@ #pragma once class SimpleAllocator { - protected: - ~SimpleAllocator() = default; - public: - virtual int8_t* allocate(const size_t num_bytes, const size_t thread_idx = 0) = 0; + virtual ~SimpleAllocator() = default; + virtual int8_t* allocate(const size_t num_bytes) = 0; }; diff --git a/Shared/StringTransform.cpp b/Shared/StringTransform.cpp index 22784948b8..9bb3176fa6 100644 
--- a/Shared/StringTransform.cpp +++ b/Shared/StringTransform.cpp @@ -17,16 +17,11 @@ #include "StringTransform.h" #include "Logger/Logger.h" -#include // Include boost::for is_any_of -#include // Include for boost::split - #include #include #include -#include -#include // format_bytes round call -#include // format_bytes +#include // format_bytes round call #ifndef __CUDACC__ #include @@ -37,7 +32,7 @@ void apply_shim(std::string& result, const boost::regex& reg_expr, const std::function& shim_fn) { boost::smatch what; - std::vector> lit_pos = find_string_literals(result); + auto lit_pos = find_string_literals(result); auto start_it = result.cbegin(); auto end_it = result.cend(); while (true) { @@ -57,8 +52,10 @@ void apply_shim(std::string& result, } } +// Scan query and save all single-quoted string literals as [begin,end) index pairs into +// lit_pos, including the surrounding quotes. std::vector> find_string_literals(const std::string& query) { - boost::regex literal_string_regex{R"(([^']+)('(?:[^']+|'')+'))", boost::regex::perl}; + boost::regex literal_string_regex{R"(([^']+)('(?:[^']+|'')*'))", boost::regex::perl}; boost::smatch what; auto it = query.begin(); auto prev_it = it; @@ -69,13 +66,14 @@ std::vector> find_string_literals(const std::string& q break; } } catch (const std::exception& e) { - LOG(WARNING) << "Error processing literals: " << e.what() - << "\nContinuing query parse..."; // boost::regex throws an exception about the complexity of matching when // the wrong type of quotes are used or they're mismatched. Let the query // through unmodified, the parser will throw a much more informative error. 
// This can also throw on very long queries - break; + std::ostringstream oss; + oss << "Detecting an error while processing string literal regex search: " + << e.what(); + throw std::runtime_error(oss.str()); } CHECK_GT(what[1].length(), 0); prev_it = it; @@ -235,7 +233,7 @@ std::string strip(std::string_view str) { std::optional inside_string_literal( const size_t start, const size_t length, - const std::vector>& literal_positions) { + std::vector> const& literal_positions) { const auto end = start + length; for (const auto& literal_position : literal_positions) { if (literal_position.first <= start && end <= literal_position.second) { diff --git a/Shared/StringTransform.h b/Shared/StringTransform.h index f6642ba9c5..91f14ca410 100644 --- a/Shared/StringTransform.h +++ b/Shared/StringTransform.h @@ -56,7 +56,7 @@ std::string format_num_bytes(const size_t bytes); std::optional inside_string_literal( const size_t start, const size_t length, - const std::vector>& literal_positions); + std::vector> const& literal_positions); #endif // __CUDACC__ template diff --git a/Shared/SystemParameters.h b/Shared/SystemParameters.h index 9edebe209c..6633230a7e 100644 --- a/Shared/SystemParameters.h +++ b/Shared/SystemParameters.h @@ -50,6 +50,10 @@ struct SystemParameters { size_t max_cpu_slab_size = size_t(1) << 32; // max size of GPU buffer pool memory allocations [bytes], default=4GB size_t max_gpu_slab_size = size_t(1) << 32; + // Default size of CPU buffer pool memory allocations [bytes], default=4GB + size_t default_cpu_slab_size = size_t(1) << 32; + // Default size of GPU buffer pool memory allocations [bytes], default=4GB + size_t default_gpu_slab_size = size_t(1) << 32; double gpu_input_mem_limit = 0.9; // Punt query to CPU if input mem exceeds % GPU mem std::string config_file = ""; std::string ssl_cert_file = ""; // file path to server's certified PKI certificate diff --git a/Shared/define_enum_class.h b/Shared/define_enum_class.h new file mode 100644 index 
0000000000..608e14ffda --- /dev/null +++ b/Shared/define_enum_class.h @@ -0,0 +1,137 @@ +/* + * Copyright 2023 HEAVY.AI, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file define_enum_class.h + * @brief Macros/templates for defining enum classes and related utilities. + * Place macro calls in the heavyai namespace so that functions like to_string() + * can be found by the compiler via ADL. + * + * Example: HEAVYAI_DEFINE_ENUM_CLASS(Color, Red, Green, Blue) + * Defines: + * 1. enum class Color { Red, Green, Blue }; + * 2. constexpr char const* to_string(Color const); + * 3. inline std::ostream &operator<<(std::ostream&, Color const); + * + * The macro HEAVYAI_DEFINE_ENUM_CLASS_WITH_DESCRIPTIONS() additionally defines + * 4. constexpr char const* to_description(Color const); + * + * template + * constexpr std::optional to_enum(std::string_view const name); + * returns the Enum if found by its string representation in O(log(N)) time. + * + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#define HEAVYAI_DEFINE_ENUM_CLASS(enum_class, ...) 
\ + enum class enum_class { __VA_ARGS__, N_ }; \ + \ + constexpr char const* to_string(enum_class const e) { \ + constexpr char const* strings[]{HEAVYAI_QUOTE_EACH(__VA_ARGS__)}; \ + constexpr size_t nstrings = sizeof(strings) / sizeof(*strings); \ + static_assert(nstrings == size_t(enum_class::N_)); \ + return strings[size_t(e)]; \ + } \ + \ + inline std::ostream& operator<<(std::ostream& os, enum_class const e) { \ + return os << to_string(e); \ + } + +#define HEAVYAI_DEFINE_ENUM_CLASS_WITH_DESCRIPTIONS(enum_class, ...) \ + HEAVYAI_DEFINE_ENUM_CLASS(enum_class, HEAVYAI_PLUCK(0, __VA_ARGS__)) \ + \ + constexpr char const* to_description(enum_class const e) { \ + constexpr char const* strings[]{HEAVYAI_PLUCK(1, __VA_ARGS__)}; \ + constexpr size_t nstrings = sizeof(strings) / sizeof(*strings); \ + static_assert(nstrings == size_t(enum_class::N_)); \ + return strings[size_t(e)]; \ + } + +// Helper macros +#define HEAVYAI_QUOTE(r, data, i, elem) BOOST_PP_COMMA_IF(i) BOOST_PP_STRINGIZE(elem) +#define HEAVYAI_QUOTE_EACH(...) \ + BOOST_PP_SEQ_FOR_EACH_I(HEAVYAI_QUOTE, , BOOST_PP_VARIADIC_TO_SEQ(__VA_ARGS__)) +#define HEAVYAI_PLUCK_ONE(r, j, i, pair) \ + BOOST_PP_COMMA_IF(i) BOOST_PP_TUPLE_ELEM(2, j, pair) +#define HEAVYAI_PLUCK(j, ...) 
\ + BOOST_PP_SEQ_FOR_EACH_I(HEAVYAI_PLUCK_ONE, j, BOOST_PP_VARIADIC_TO_SEQ(__VA_ARGS__)) + +namespace heavyai { + +// Helper function and struct templates +template +struct StringEnum { + std::string_view name; + Enum value; + + bool operator<(std::string_view const name) const { return this->name < name; } +}; + +template +constexpr auto enum_to_array(std::index_sequence) { + return std::array, sizeof...(I)>{ + StringEnum{to_string(static_cast(I)), static_cast(I)}...}; +} + +template +constexpr void insertion_sort(std::array& arr) { + for (size_t i = 1; i < N; ++i) { + auto key = arr[i]; + size_t j = i; + for (; j && key.name < arr[j - 1].name; --j) { + arr[j] = arr[j - 1]; + } + arr[j] = key; + } +} + +template +constexpr std::array, size_t(Enum::N_)> sort_by_name() { + auto arr = enum_to_array(std::make_index_sequence()); + insertion_sort(arr); + return arr; +} + +// Return std::optional given string name in O(log(Enum::N_)) time and stack space. +template +std::optional to_enum(std::string_view const name) { + constexpr std::array, size_t(Enum::N_)> arr = sort_by_name(); + auto const itr = std::lower_bound(arr.begin(), arr.end(), name); + bool const found = itr != arr.end() && itr->name == name; + return found ? std::make_optional(itr->value) : std::nullopt; +} + +// Example: IsAny::check(Color::Blue); +template +struct IsAny { + template + static bool check(T const value) { + // Casting to T allows for safe comparison against out-of-range value. 
+ // Example: IsAny::check(-1); + return ((static_cast(Values) == value) || ...); + } +}; + +} // namespace heavyai diff --git a/Shared/measure.h b/Shared/measure.h index bde50a1a9e..fc3a758a2b 100644 --- a/Shared/measure.h +++ b/Shared/measure.h @@ -51,6 +51,32 @@ typename TypeR::rep timer_stop(Type clock_begin) { return duration.count(); } +template +class Timer { + public: + Timer() : duration{0}, timer_started_(false) {} + + void start() { + start_time_ = timer_start(); + timer_started_ = true; + } + + void stop() { + if (!timer_started_) { + LOG(WARNING) << " unexpected call to stop on a timer that has not started"; + } + duration += timer_stop(start_time_); + timer_started_ = false; + } + + typename TimeT::rep elapsed() { return duration; } + + private: + std::chrono::steady_clock::time_point start_time_; + typename TimeT::rep duration; + bool timer_started_; +}; + const auto timer_stop_microseconds = timer_stop; diff --git a/Shared/misc.cpp b/Shared/misc.cpp index dfa113d231..1ae92e7fa3 100644 --- a/Shared/misc.cpp +++ b/Shared/misc.cpp @@ -17,10 +17,12 @@ // Credits: Howard Hinnant for open source date calculations. #include "misc.h" +#include "sqltypes.h" +#include #include - -#include "sqltypes.h" +#include +#include namespace shared { @@ -144,4 +146,50 @@ size_t compute_hash(int32_t item_1, int32_t item_2) { (static_cast(item_2)); } +// Escape and quote contents of filename as a json string and output to os. +// Q: Why not just return the file contents as a string? +// A: Constructing a string may unnecessarily contribute to memory fragmentation, +// and is probably less performant due to the extra heap allocations. 
+void FileContentsEscaper::quoteAndPrint(std::ostream& os) const { + std::ifstream file(filename); + if (!file.is_open()) { + os << "\"Unable to open " << filename << '"'; + return; + } + char ch; + std::ios orig_os_state(nullptr); + orig_os_state.copyfmt(os); + os << '"'; + while (file.get(ch)) { + if (ch == '"') { + os << "\\\""; + } else if (ch == '\\') { + os << "\\\\"; + } else if (std::isprint(ch) || ch == ' ') { + os << ch; + } else { + switch (ch) { + // clang-format off + case '\b': os << "\\b"; break; + case '\f': os << "\\f"; break; + case '\n': os << "\\n"; break; + case '\r': os << "\\r"; break; + case '\t': os << "\\t"; break; + // clang-format on + default: + os << "\\u" << std::hex << std::setw(4) << std::setfill('0') + << static_cast(static_cast(ch)); + break; + } + } + } + os << '"'; + os.copyfmt(orig_os_state); +} + +std::ostream& operator<<(std::ostream& os, FileContentsEscaper const& fce) { + fce.quoteAndPrint(os); + return os; +} + } // namespace shared diff --git a/Shared/misc.h b/Shared/misc.h index 106abc2126..59f99dedf0 100644 --- a/Shared/misc.h +++ b/Shared/misc.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -144,6 +145,14 @@ OSTREAM& operator<<(OSTREAM& os, PrintContainer pc) { } } +// Usage: ostream << FileContentsEscaper{"/path/to/file"}; +struct FileContentsEscaper { + char const* const filename; + // Escape as a json string + void quoteAndPrint(std::ostream&) const; +}; +std::ostream& operator<<(std::ostream&, FileContentsEscaper const&); + // Same as strftime(buf, max, "%F", tm) but guarantees that the year is // zero-padded to a minimum length of 4. Return the number of characters // written, not including null byte. If max is not large enough, return 0. 
diff --git a/Shared/quantile.h b/Shared/quantile.h index 20eb1aa5e6..0022572f6c 100644 --- a/Shared/quantile.h +++ b/Shared/quantile.h @@ -189,15 +189,16 @@ class TDigest { Centroids centroids_; bool forward_{true}; // alternate direction on each call to mergeCentroids(). #ifndef __CUDACC__ - std::once_flag merge_buffer_final_once_; + std::mutex merge_buffer_final_called_mutex_; #endif + bool merge_buffer_final_called_{false}; // simple_allocator_, buf_allocate_, centroids_allocate_ are used only by allocate(). - std::optional const q_{std::nullopt}; // Optional preset quantile parameter. - bool const use_linear_scaling_function_{false}; - SimpleAllocator* const simple_allocator_{nullptr}; - IndexType const buf_allocate_{0}; - IndexType const centroids_allocate_{0}; + std::optional q_{std::nullopt}; // Optional preset quantile parameter. + bool use_linear_scaling_function_{false}; + SimpleAllocator* simple_allocator_{nullptr}; + IndexType buf_allocate_{0}; + IndexType centroids_allocate_{0}; DEVICE RealType max() const { return centroids_.max_; @@ -242,6 +243,25 @@ class TDigest { , buf_allocate_(buf_allocate) , centroids_allocate_(centroids_allocate) {} + // Called by approx_quantile_jit_rt(). Move everything except the unmovable mutex. + TDigest& operator=(TDigest&& rhs) { + buf_ = std::move(rhs.buf_); + centroids_ = std::move(rhs.centroids_); + forward_ = std::move(rhs.forward_); + merge_buffer_final_called_ = std::move(rhs.merge_buffer_final_called_); + q_ = std::move(rhs.q_); + use_linear_scaling_function_ = std::move(rhs.use_linear_scaling_function_); + simple_allocator_ = std::move(rhs.simple_allocator_); + buf_allocate_ = std::move(rhs.buf_allocate_); + centroids_allocate_ = std::move(rhs.centroids_allocate_); + return *this; + } + + // Size of reserved buffer+centroids space used for each TDigest in bytes. 
+ static IndexType nbytes(IndexType buf_allocate, IndexType centroids_allocate) { + return (buf_allocate + centroids_allocate) * (sizeof(RealType) + sizeof(IndexType)); + } + DEVICE Centroids& centroids() { return centroids_; } @@ -661,16 +681,15 @@ DEVICE void TDigest::mergeBuffer() { // [QE-383] Make concurrent calls to mergeBufferFinal() thread-safe. template DEVICE void TDigest::mergeBufferFinal() { - auto const call_once = [this] { +#ifndef __CUDACC__ + std::lock_guard lock_guard(merge_buffer_final_called_mutex_); +#endif + if (!merge_buffer_final_called_) { mergeBuffer(); assert(centroids_.size() <= buf_.capacity()); partialSumOfCounts(buf_.counts_.data()); - }; -#ifdef __CUDACC__ - call_once(); -#else - std::call_once(merge_buffer_final_once_, call_once); -#endif + merge_buffer_final_called_ = true; + } } template diff --git a/Shared/sqldefs.h b/Shared/sqldefs.h index f525339e3a..7e04a18355 100644 --- a/Shared/sqldefs.h +++ b/Shared/sqldefs.h @@ -20,8 +20,11 @@ * */ -#ifndef SQLDEFS_H -#define SQLDEFS_H +#pragma once + +#include +#include +#include // must not change the order without keeping the array in OperExpr::to_string // in sync. @@ -109,21 +112,22 @@ enum class SqlStringOpKind { /* 6 args */ REGEXP_REPLACE, REGEXP_SUBSTR, + REGEXP_COUNT, JSON_VALUE, BASE64_ENCODE, BASE64_DECODE, + URL_ENCODE, + URL_DECODE, TRY_STRING_CAST, // string-to-numeric POSITION, // string-to-numeric JAROWINKLER_SIMILARITY, // string-to-numeric LEVENSHTEIN_DISTANCE, // string-to-numeric + HASH, // string-to-numeric INVALID }; enum class SqlWindowFunctionKind { - // set MIN's enum val as one, and we use window function kind's enum vals - // to classify a behavior of our runtime code for window framing - // i.e., aggregate_##value_type##_values functions - MIN = 1, + MIN = 0, MAX, AVG, SUM, @@ -151,11 +155,52 @@ enum class SqlWindowFunctionKind { SUM_INTERNAL, // For deserialization from Calcite only. Gets rewritten to a regular // SUM. 
CONDITIONAL_CHANGE_EVENT, - INVALID + UNKNOWN, }; +constexpr char const* toString(SqlWindowFunctionKind const kind) { + constexpr char const* strings[]{"MIN", + "MAX", + "AVG", + "SUM", + "COUNT", + "ROW_NUMBER", + "RANK", + "DENSE_RANK", + "PERCENT_RANK", + "CUME_DIST", + "NTILE", + "LAG", + "LAG_IN_FRAME", + "LEAD", + "LEAD_IN_FRAME", + "FIRST_VALUE", + "FIRST_VALUE_IN_FRAME", + "LAST_VALUE", + "LAST_VALUE_IN_FRAME", + "NTH_VALUE", + "NTH_VALUE_IN_FRAME", + "FORWARD_FILL", + "BACKWARD_FILL", + "COUNT_IF", + "SUM_IF", + "SUM_INTERNAL", + "CONDITIONAL_CHANGE_EVENT", + "UNKNOWN"}; + constexpr size_t nstrings = ((sizeof strings) / (sizeof *strings)); + constexpr size_t max_str_idx = nstrings - 1; + static_assert(max_str_idx == size_t(SqlWindowFunctionKind::UNKNOWN)); + return strings[size_t(kind)]; +} + +#ifndef __CUDACC__ +inline std::ostream& operator<<(std::ostream& os, SqlWindowFunctionKind const kind) { + return os << toString(kind); +} +#endif + enum class SqlWindowFrameBoundType { - UNBOUNDED_PRECEDING = 1, + UNBOUNDED_PRECEDING = 0, EXPR_PRECEDING, CURRENT_ROW, EXPR_FOLLOWING, @@ -163,6 +208,25 @@ enum class SqlWindowFrameBoundType { UNKNOWN }; +constexpr char const* toString(SqlWindowFrameBoundType const kind) { + constexpr char const* strings[]{"UNBOUNDED_PRECEDING", + "EXPR_PRECEDING", + "CURRENT_ROW", + "EXPR_FOLLOWING", + "UNBOUNDED_FOLLOWING", + "UNKNOWN"}; + constexpr size_t nstrings = ((sizeof strings) / (sizeof *strings)); + constexpr size_t max_str_idx = nstrings - 1; + static_assert(max_str_idx == size_t(SqlWindowFrameBoundType::UNKNOWN)); + return strings[size_t(kind)]; +} + +#ifndef __CUDACC__ +inline std::ostream& operator<<(std::ostream& os, SqlWindowFrameBoundType const kind) { + return os << toString(kind); +} +#endif + enum SQLStmtType { kSELECT, kUPDATE, kINSERT, kDELETE, kCREATE_TABLE }; // StorageOption::kCPU and ::kGPU conflict with libtorch's c10::DeviceType::kCPU @@ -363,12 +427,18 @@ inline std::ostream& operator<<(std::ostream& os, 
const SqlStringOpKind kind) { return os << "REGEXP_REPLACE"; case SqlStringOpKind::REGEXP_SUBSTR: return os << "REGEXP_SUBSTR"; + case SqlStringOpKind::REGEXP_COUNT: + return os << "REGEXP_COUNT"; case SqlStringOpKind::JSON_VALUE: return os << "JSON_VALUE"; case SqlStringOpKind::BASE64_ENCODE: return os << "BASE64_ENCODE"; case SqlStringOpKind::BASE64_DECODE: return os << "BASE64_DECODE"; + case SqlStringOpKind::URL_ENCODE: + return os << "URL_ENCODE"; + case SqlStringOpKind::URL_DECODE: + return os << "URL_DECODE"; case SqlStringOpKind::TRY_STRING_CAST: return os << "TRY_STRING_CAST"; case SqlStringOpKind::POSITION: @@ -377,6 +447,8 @@ inline std::ostream& operator<<(std::ostream& os, const SqlStringOpKind kind) { return os << "JAROWINKLER_SIMILARITY"; case SqlStringOpKind::LEVENSHTEIN_DISTANCE: return os << "LEVENSHTEIN_DISTANCE"; + case SqlStringOpKind::HASH: + return os << "HASH"; case SqlStringOpKind::INVALID: return os << "INVALID"; } @@ -440,6 +512,9 @@ inline SqlStringOpKind name_to_string_op_kind(const std::string& func_name) { if (func_name == "REGEXP_MATCH") { return SqlStringOpKind::REGEXP_SUBSTR; } + if (func_name == "REGEXP_COUNT") { + return SqlStringOpKind::REGEXP_COUNT; + } if (func_name == "JSON_VALUE") { return SqlStringOpKind::JSON_VALUE; } @@ -449,6 +524,12 @@ inline SqlStringOpKind name_to_string_op_kind(const std::string& func_name) { if (func_name == "BASE64_DECODE") { return SqlStringOpKind::BASE64_DECODE; } + if (func_name == "URL_ENCODE") { + return SqlStringOpKind::URL_ENCODE; + } + if (func_name == "URL_DECODE") { + return SqlStringOpKind::URL_DECODE; + } if (func_name == "TRY_CAST") { return SqlStringOpKind::TRY_STRING_CAST; } @@ -461,6 +542,9 @@ inline SqlStringOpKind name_to_string_op_kind(const std::string& func_name) { if (func_name == "LEVENSHTEIN_DISTANCE") { return SqlStringOpKind::LEVENSHTEIN_DISTANCE; } + if (func_name == "HASH") { + return SqlStringOpKind::HASH; + } LOG(FATAL) << "Invalid string function " << func_name << 
"."; return SqlStringOpKind::INVALID; } @@ -469,94 +553,14 @@ inline bool string_op_returns_string(const SqlStringOpKind kind) { switch (kind) { case SqlStringOpKind::TRY_STRING_CAST: case SqlStringOpKind::POSITION: + case SqlStringOpKind::JAROWINKLER_SIMILARITY: + case SqlStringOpKind::LEVENSHTEIN_DISTANCE: + case SqlStringOpKind::REGEXP_COUNT: + case SqlStringOpKind::HASH: return false; default: return true; } } -inline std::string toString(const SqlWindowFunctionKind& kind) { - switch (kind) { - case SqlWindowFunctionKind::ROW_NUMBER: - return "ROW_NUMBER"; - case SqlWindowFunctionKind::RANK: - return "RANK"; - case SqlWindowFunctionKind::DENSE_RANK: - return "DENSE_RANK"; - case SqlWindowFunctionKind::PERCENT_RANK: - return "PERCENT_RANK"; - case SqlWindowFunctionKind::CUME_DIST: - return "CUME_DIST"; - case SqlWindowFunctionKind::NTILE: - return "NTILE"; - case SqlWindowFunctionKind::LAG: - return "LAG"; - case SqlWindowFunctionKind::LEAD: - return "LEAD"; - case SqlWindowFunctionKind::FIRST_VALUE: - return "FIRST_VALUE"; - case SqlWindowFunctionKind::FIRST_VALUE_IN_FRAME: - return "FIRST_VALUE_IN_FRAME"; - case SqlWindowFunctionKind::LAST_VALUE: - return "LAST_VALUE"; - case SqlWindowFunctionKind::LAST_VALUE_IN_FRAME: - return "LAST_VALUE_IN_FRAME"; - case SqlWindowFunctionKind::NTH_VALUE: - return "NTH_VALUE"; - case SqlWindowFunctionKind::NTH_VALUE_IN_FRAME: - return "NTH_VALUE_IN_FRAME"; - case SqlWindowFunctionKind::AVG: - return "AVG"; - case SqlWindowFunctionKind::MIN: - return "MIN"; - case SqlWindowFunctionKind::MAX: - return "MAX"; - case SqlWindowFunctionKind::SUM: - return "SUM"; - case SqlWindowFunctionKind::COUNT: - return "COUNT"; - case SqlWindowFunctionKind::SUM_INTERNAL: - return "SUM_INTERNAL"; - case SqlWindowFunctionKind::LEAD_IN_FRAME: - return "LEAD_IN_FRAME"; - case SqlWindowFunctionKind::LAG_IN_FRAME: - return "LAG_IN_FRAME"; - case SqlWindowFunctionKind::COUNT_IF: - return "COUNT_IF"; - case SqlWindowFunctionKind::SUM_IF: - return 
"SUM_IF"; - case SqlWindowFunctionKind::FORWARD_FILL: - return "FORWARD_FILL"; - case SqlWindowFunctionKind::BACKWARD_FILL: - return "BACKWARD_FILL"; - case SqlWindowFunctionKind::CONDITIONAL_CHANGE_EVENT: - return "CONDITIONAL_CHANGE_EVENT"; - case SqlWindowFunctionKind::INVALID: - return "INVALID"; - } - LOG(FATAL) << "Invalid window function kind."; - return ""; -} - -inline std::string toString(const SqlWindowFrameBoundType& kind) { - switch (kind) { - case SqlWindowFrameBoundType::UNBOUNDED_PRECEDING: - return "UNBOUNDED_PRECEDING"; - case SqlWindowFrameBoundType::EXPR_PRECEDING: - return "EXPR_PRECEDING"; - case SqlWindowFrameBoundType::CURRENT_ROW: - return "CURRENT_ROW"; - case SqlWindowFrameBoundType::EXPR_FOLLOWING: - return "EXPR_FOLLOWING"; - case SqlWindowFrameBoundType::UNBOUNDED_FOLLOWING: - return "UNBOUNDED_FOLLOWING"; - case SqlWindowFrameBoundType::UNKNOWN: - return "UNKNOWN"; - } - LOG(FATAL) << "Invalid window function bound type."; - return ""; -} - #endif // #if !(defined(__CUDACC__) || defined(NO_BOOST)) - -#endif // SQLDEFS_H diff --git a/Shared/sqltypes.h b/Shared/sqltypes.h index f8e9dde551..825c5ec85a 100644 --- a/Shared/sqltypes.h +++ b/Shared/sqltypes.h @@ -416,6 +416,8 @@ class SQLTypeInfo { return false; } + inline bool is_variable_size() const { return size == -1; } + inline int get_logical_size() const { if (compression == kENCODING_FIXED || compression == kENCODING_DATE_IN_DAYS) { SQLTypeInfo ti(type, dimension, scale, notnull, kENCODING_NONE, 0, subtype); diff --git a/StringDictionary/StringDictionary.cpp b/StringDictionary/StringDictionary.cpp index 263e7766d3..afdf53a8f5 100644 --- a/StringDictionary/StringDictionary.cpp +++ b/StringDictionary/StringDictionary.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ #include "Utils/StringLike.h" #include "LeafHostInfo.h" +#include "Shared/measure.h" bool g_cache_string_hash{true}; @@ -812,80 +814,103 @@ size_t 
StringDictionary::storageEntryCount() const { return str_count_; } -namespace { - -bool is_like(const std::string& str, - const std::string& pattern, - const bool icase, - const bool is_simple, - const char escape) { - return icase - ? (is_simple ? string_ilike_simple( - str.c_str(), str.size(), pattern.c_str(), pattern.size()) - : string_ilike(str.c_str(), - str.size(), - pattern.c_str(), - pattern.size(), - escape)) - : (is_simple ? string_like_simple( - str.c_str(), str.size(), pattern.c_str(), pattern.size()) - : string_like(str.c_str(), - str.size(), - pattern.c_str(), - pattern.size(), - escape)); +template +std::vector StringDictionary::getLikeImpl(const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape, + const size_t generation) const { + constexpr size_t grain_size{1000}; + auto is_like_impl = icase ? is_simple ? string_ilike_simple : string_ilike + : is_simple ? string_like_simple + : string_like; + auto const num_threads = static_cast(cpu_threads()); + std::vector> worker_results(num_threads); + tbb::task_arena limited_arena(num_threads); + limited_arena.execute([&] { + tbb::parallel_for( + tbb::blocked_range(0, generation, grain_size), + [&is_like_impl, &pattern, &escape, &worker_results, this]( + const tbb::blocked_range& range) { + auto& result_vector = + worker_results[tbb::this_task_arena::current_thread_index()]; + for (size_t i = range.begin(); i < range.end(); ++i) { + const auto str = getStringUnlocked(i); + if (is_like_impl( + str.c_str(), str.size(), pattern.c_str(), pattern.size(), escape)) { + result_vector.push_back(i); + } + } + }); + }); + // partial_sum to get 1) a start offset for each thread and 2) the total # elems + std::vector start_offsets(num_threads + 1, 0); + auto vec_size = [](std::vector const& vec) { return vec.size(); }; + auto begin = boost::make_transform_iterator(worker_results.begin(), vec_size); + auto end = boost::make_transform_iterator(worker_results.end(), vec_size); + 
std::partial_sum(begin, end, start_offsets.begin() + 1); // first element is 0 + + std::vector result(start_offsets[num_threads]); + limited_arena.execute([&] { + tbb::parallel_for( + tbb::blocked_range(0, num_threads, 1), + [&worker_results, &result, &start_offsets]( + const tbb::blocked_range& range) { + auto& result_vector = worker_results[range.begin()]; + auto const start_offset = start_offsets[range.begin()]; + std::copy( + result_vector.begin(), result_vector.end(), result.begin() + start_offset); + }, + tbb::static_partitioner()); + }); + return result; } - -} // namespace - -std::vector StringDictionary::getLike(const std::string& pattern, - const bool icase, - const bool is_simple, - const char escape, - const size_t generation) const { +template <> +std::vector StringDictionary::getLike(const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape, + const size_t generation) const { std::lock_guard write_lock(rw_mutex_); if (isClient()) { - return client_->get_like(pattern, icase, is_simple, escape, generation); + return client_->get_like_i32(pattern, icase, is_simple, escape, generation); } const auto cache_key = std::make_tuple(pattern, icase, is_simple, escape); - const auto it = like_cache_.find(cache_key); - if (it != like_cache_.end()) { + const auto it = like_i32_cache_.find(cache_key); + if (it != like_i32_cache_.end()) { return it->second; } - std::vector result; - std::vector workers; - int worker_count = cpu_threads(); - CHECK_GT(worker_count, 0); - std::vector> worker_results(worker_count); - CHECK_LE(generation, str_count_); - for (int worker_idx = 0; worker_idx < worker_count; ++worker_idx) { - workers.emplace_back([&worker_results, - &pattern, - generation, - icase, - is_simple, - escape, - worker_idx, - worker_count, - this]() { - for (size_t string_id = worker_idx; string_id < generation; - string_id += worker_count) { - const auto str = getStringUnlocked(string_id); - if (is_like(str, pattern, icase, 
is_simple, escape)) { - worker_results[worker_idx].push_back(string_id); - } - } - }); - } - for (auto& worker : workers) { - worker.join(); + + auto result = getLikeImpl(pattern, icase, is_simple, escape, generation); + // place result into cache for reuse if similar query + const auto it_ok = like_i32_cache_.insert(std::make_pair(cache_key, result)); + like_cache_size_ += (pattern.size() + 3 + (result.size() * sizeof(int32_t))); + + CHECK(it_ok.second); + + return result; +} + +template <> +std::vector StringDictionary::getLike(const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape, + const size_t generation) const { + std::lock_guard write_lock(rw_mutex_); + if (isClient()) { + return client_->get_like_i64(pattern, icase, is_simple, escape, generation); } - for (const auto& worker_result : worker_results) { - result.insert(result.end(), worker_result.begin(), worker_result.end()); + const auto cache_key = std::make_tuple(pattern, icase, is_simple, escape); + const auto it = like_i64_cache_.find(cache_key); + if (it != like_i64_cache_.end()) { + return it->second; } + + auto result = getLikeImpl(pattern, icase, is_simple, escape, generation); // place result into cache for reuse if similar query - const auto it_ok = like_cache_.insert(std::make_pair(cache_key, result)); - like_cache_size_ += (pattern.size() + 3 + (result.size() * sizeof(int32_t))); + const auto it_ok = like_i64_cache_.insert(std::make_pair(cache_key, result)); + like_cache_size_ += (pattern.size() + 3 + (result.size() * sizeof(int64_t))); CHECK(it_ok.second); @@ -1599,8 +1624,11 @@ void* StringDictionary::addMemoryCapacity(void* addr, } void StringDictionary::invalidateInvertedIndex() noexcept { - if (!like_cache_.empty()) { - decltype(like_cache_)().swap(like_cache_); + if (!like_i32_cache_.empty()) { + decltype(like_i32_cache_)().swap(like_i32_cache_); + } + if (!like_i64_cache_.empty()) { + decltype(like_i64_cache_)().swap(like_i64_cache_); } if 
(!regex_cache_.empty()) { decltype(regex_cache_)().swap(regex_cache_); @@ -2081,6 +2109,7 @@ void translate_string_ids(std::vector& dest_ids, } size_t StringDictionary::computeCacheSize() const { + std::shared_lock read_lock(rw_mutex_); return string_id_string_dict_hash_table_.size() * sizeof(int32_t) + hash_cache_.size() * sizeof(string_dict_hash_t) + sorted_cache.size() * sizeof(int32_t) + like_cache_size_ + regex_cache_size_ + diff --git a/StringDictionary/StringDictionary.h b/StringDictionary/StringDictionary.h index e6e53763d3..99c6873b19 100644 --- a/StringDictionary/StringDictionary.h +++ b/StringDictionary/StringDictionary.h @@ -99,11 +99,19 @@ class StringDictionary { std::pair getStringBytes(int32_t string_id) const noexcept; size_t storageEntryCount() const; - std::vector getLike(const std::string& pattern, - const bool icase, - const bool is_simple, - const char escape, - const size_t generation) const; + template + std::vector getLike(const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape, + const size_t generation) const; + + template + std::vector getLikeImpl(const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape, + const size_t generation) const; std::vector getCompare(const std::string& pattern, const std::string& comp_operator, @@ -284,7 +292,9 @@ class StringDictionary { size_t payload_file_off_; mutable std::shared_mutex rw_mutex_; mutable std::map, std::vector> - like_cache_; + like_i32_cache_; + mutable std::map, std::vector> + like_i64_cache_; mutable size_t like_cache_size_; mutable std::map, std::vector> regex_cache_; mutable size_t regex_cache_size_; diff --git a/StringDictionary/StringDictionaryClient.h b/StringDictionary/StringDictionaryClient.h index 13c3846beb..e2edfe1429 100644 --- a/StringDictionary/StringDictionaryClient.h +++ b/StringDictionary/StringDictionaryClient.h @@ -47,14 +47,24 @@ class StringDictionaryClient { return 0; }; - std::vector 
get_like(const std::string& pattern, - const bool icase, - const bool is_simple, - const char escape, - const int64_t generation) { + std::vector get_like_i32(const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape, + const int64_t generation) { CHECK(false); return std::vector{}; - }; + } + + std::vector get_like_i64(const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape, + const int64_t generation) { + CHECK(false); + return std::vector{}; + } + std::vector get_compare(const std::string& pattern, const std::string& comp_operator, diff --git a/StringDictionary/StringDictionaryProxy.cpp b/StringDictionary/StringDictionaryProxy.cpp index c6e52a30c4..32c9e34f89 100644 --- a/StringDictionary/StringDictionaryProxy.cpp +++ b/StringDictionary/StringDictionaryProxy.cpp @@ -465,46 +465,37 @@ StringDictionaryProxy::IdMap StringDictionaryProxy::buildUnionTranslationMapToOt return id_map; } -namespace { - -bool is_like(const std::string& str, - const std::string& pattern, - const bool icase, - const bool is_simple, - const char escape) { - return icase - ? (is_simple ? string_ilike_simple( - str.c_str(), str.size(), pattern.c_str(), pattern.size()) - : string_ilike(str.c_str(), - str.size(), - pattern.c_str(), - pattern.size(), - escape)) - : (is_simple ? 
string_like_simple( - str.c_str(), str.size(), pattern.c_str(), pattern.size()) - : string_like(str.c_str(), - str.size(), - pattern.c_str(), - pattern.size(), - escape)); -} - -} // namespace - -std::vector StringDictionaryProxy::getLike(const std::string& pattern, - const bool icase, - const bool is_simple, - const char escape) const { +template +std::vector StringDictionaryProxy::getLike(const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape) const { CHECK_GE(generation_, 0); - auto result = string_dict_->getLike(pattern, icase, is_simple, escape, generation_); + auto result = string_dict_->getLike(pattern, icase, is_simple, escape, generation_); + auto is_like_impl = icase ? is_simple ? string_ilike_simple : string_ilike + : is_simple ? string_like_simple + : string_like; for (unsigned index = 0; index < transient_string_vec_.size(); ++index) { - if (is_like(*transient_string_vec_[index], pattern, icase, is_simple, escape)) { + auto const str = *transient_string_vec_[index]; + if (is_like_impl(str.c_str(), str.size(), pattern.c_str(), pattern.size(), escape)) { result.push_back(transientIndexToId(index)); } } return result; } +template std::vector StringDictionaryProxy::getLike( + const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape) const; + +template std::vector StringDictionaryProxy::getLike( + const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape) const; + namespace { bool do_compare(const std::string& str, diff --git a/StringDictionary/StringDictionaryProxy.h b/StringDictionary/StringDictionaryProxy.h index 102c72c489..633bf9afde 100644 --- a/StringDictionary/StringDictionaryProxy.h +++ b/StringDictionary/StringDictionaryProxy.h @@ -207,10 +207,11 @@ class StringDictionaryProxy { void updateGeneration(const int64_t generation) noexcept; - std::vector getLike(const std::string& pattern, - const bool icase, - const bool is_simple, - const char 
escape) const; + template + std::vector getLike(const std::string& pattern, + const bool icase, + const bool is_simple, + const char escape) const; std::vector getCompare(const std::string& pattern, const std::string& comp_operator) const; diff --git a/StringOps/StringOps.cpp b/StringOps/StringOps.cpp index c7eb852c2f..2f047c4abb 100644 --- a/StringOps/StringOps.cpp +++ b/StringOps/StringOps.cpp @@ -315,6 +315,26 @@ Datum LevenshteinDistance::numericEval(const std::string_view str1, return return_datum; } +NullableStrType Hash::operator()(const std::string& str) const { + UNREACHABLE() << "Invalid string output for Hash"; + return {}; +} + +Datum Hash::numericEval(const std::string_view str) const { + if (str.empty()) { + return NullDatum(return_ti_); + } else { + uint64_t str_hash = 1; + // rely on fact that unsigned overflow is defined and wraps + for (size_t i = 0; i < str.size(); ++i) { + str_hash = str_hash * 997u + static_cast(str[i]); + } + Datum return_datum; + return_datum.bigintval = static_cast(str_hash); + return return_datum; + } +} + NullableStrType Lower::operator()(const std::string& str) const { std::string output_str(str); std::transform( @@ -672,6 +692,31 @@ std::pair RegexpSubstr::set_sub_match_info( true, sub_match_group_idx > 0L ? sub_match_group_idx - 1 : sub_match_group_idx); } +NullableStrType RegexpCount::operator()(const std::string& str) const { + UNREACHABLE() << "Invalid string output for RegexpCount"; + return {}; +} + +Datum RegexpCount::numericEval(const std::string_view str_view) const { + if (str_view.empty()) { + return NullDatum(return_ti_); + } + + Datum return_datum; + const int64_t str_len = str_view.size(); + const int64_t pos = start_pos_ < 0 ? 
str_len + start_pos_ : start_pos_; + const size_t wrapped_start = std::clamp(pos, int64_t(0), str_len); + auto search_start = str_view.data() + wrapped_start; + auto search_end = str_view.data() + str_len; + boost::cregex_iterator iter(search_start, search_end, regex_pattern_); + boost::cregex_iterator end; + + int64_t num_matches = std::distance(iter, end); + return_datum.bigintval = num_matches; + + return return_datum; +} + // json_path must start with "lax $", "strict $" or "$" (case-insensitive). JsonValue::JsonParseMode JsonValue::parse_json_parse_mode(std::string_view json_path) { size_t const string_pos = json_path.find('$'); @@ -832,6 +877,78 @@ NullableStrType Base64Decode::operator()(const std::string& str) const { return shared::decode_base64(str); } +namespace { +// Unreserved characters https://www.rfc-editor.org/rfc/rfc3986#section-2.3 +bool is_normal(char const c) { + return std::isalnum(c) || c == '-' || c == '.' || c == '_' || c == '~'; +} + +// True iff c will be encoded into a single character. +bool is_singular(char const c) { + return is_normal(c) || c == ' '; +} + +// Count % chars that are eligible to begin a url-encoded triplet. +size_t count_percents(std::string const& str) { + size_t n_percents = 0u; + if (2u < str.size()) { + for (size_t i = 0u; i < str.size() - 2u; ++i) { + if (str[i] == '%') { + ++n_percents; + i += 2u; + } + } + } + return n_percents; +} + +// If hex is a hex digit, return int value from 0-15. Otherwise undefined. +int nibble(char const hex) { + return 'A' <= hex ? std::toupper(hex) + (10 - 'A') : hex - '0'; +} +} // namespace + +// Encode unreserved characters (RFC 3986 Sec. 2.3) into themselves. +// Encode space ' ' into plus '+'. +// Encode all other characters c into "%XX" where XX is the hex value of c. +NullableStrType UrlEncode::operator()(const std::string& str) const { + constexpr char const* tr = "0123456789ABCDEF"; + // Number of characters in string that will be copied/translated into a single char. 
+ size_t const n_singular = std::count_if(str.begin(), str.end(), is_singular); + std::string encoded; + encoded.reserve(str.size() + 2u * (str.size() - n_singular)); + for (char const c : str) { + if (is_normal(c)) { + encoded.append(1u, c); + } else if (c == ' ') { + encoded.append(1u, '+'); + } else { + encoded.append(1u, '%'); + encoded.append(1u, tr[(c >> 4) & 0xf]); + encoded.append(1u, tr[c & 0xf]); + } + } + return encoded; +} + +// Inverse of UrlEncode::operator(). Garbage in, garbage out, but must never segfault. +NullableStrType UrlDecode::operator()(const std::string& str) const { + size_t const n_percents = count_percents(str); + std::string decoded; + decoded.reserve(str.size() - 2u * n_percents); + for (size_t i = 0u; i < str.size(); ++i) { + if (str[i] == '%' && i + 2u < str.size()) { + decoded.append(1u, nibble(str[i + 1u]) << 4 ^ nibble(str[i + 2u])); + i += 2u; // Skip the two hexadecimal digits + } else if (str[i] == '+') { + decoded.append(1u, ' '); + } else { // Append normal characters, or % if one of last two characters. 
+ decoded.append(1u, str[i]); + } + } + return decoded; +} + std::string StringOps::operator()(const std::string& str) const { NullableStrType modified_str(str); if (modified_str.is_null) { @@ -925,7 +1042,8 @@ std::unique_ptr gen_string_op(const StringOpInfo& string_op_info const auto& return_ti = string_op_info.getReturnType(); if (string_op_info.hasNullLiteralArg()) { - return std::make_unique(var_string_optional_literal, op_kind); + return std::make_unique( + return_ti, var_string_optional_literal, op_kind); } const auto num_non_variable_literals = string_op_info.numNonVariableLiterals(); @@ -1067,6 +1185,17 @@ std::unique_ptr gen_string_op(const StringOpInfo& string_op_info regex_params_literal, sub_match_idx_literal); } + case SqlStringOpKind::REGEXP_COUNT: { + CHECK_GE(num_non_variable_literals, 3UL); + CHECK_LE(num_non_variable_literals, 3UL); + const auto pattern_literal = string_op_info.getStringLiteral(1); + const auto start_pos_literal = string_op_info.getIntLiteral(2); + const auto regex_params_literal = string_op_info.getStringLiteral(3); + return std::make_unique(var_string_optional_literal, + pattern_literal, + start_pos_literal, + regex_params_literal); + } case SqlStringOpKind::JSON_VALUE: { CHECK_EQ(num_non_variable_literals, 1UL); const auto json_path_literal = string_op_info.getStringLiteral(1); @@ -1081,6 +1210,14 @@ std::unique_ptr gen_string_op(const StringOpInfo& string_op_info CHECK_EQ(num_non_variable_literals, 0UL); return std::make_unique(var_string_optional_literal); } + case SqlStringOpKind::URL_ENCODE: { + CHECK_EQ(num_non_variable_literals, 0UL); + return std::make_unique(var_string_optional_literal); + } + case SqlStringOpKind::URL_DECODE: { + CHECK_EQ(num_non_variable_literals, 0UL); + return std::make_unique(var_string_optional_literal); + } case SqlStringOpKind::TRY_STRING_CAST: { CHECK_EQ(num_non_variable_literals, 0UL); return std::make_unique(return_ti, @@ -1122,13 +1259,14 @@ std::unique_ptr gen_string_op(const StringOpInfo& 
string_op_info return std::make_unique(var_string_optional_literal); } } - default: { - UNREACHABLE(); - return std::make_unique(var_string_optional_literal, op_kind); + case SqlStringOpKind::HASH: { + CHECK_EQ(num_non_variable_literals, 0UL); + return std::make_unique(var_string_optional_literal); } + default: + UNREACHABLE(); + return {}; } - // Make compiler happy - return std::make_unique(var_string_optional_literal, op_kind); } std::pair apply_string_op_to_literals( diff --git a/StringOps/StringOps.h b/StringOps/StringOps.h index 76a68cf517..0fcdca31bb 100644 --- a/StringOps/StringOps.h +++ b/StringOps/StringOps.h @@ -52,7 +52,7 @@ struct StringOp { StringOp(const SqlStringOpKind op_kind, const std::optional& var_str_optional_literal) : op_kind_(op_kind) - , return_ti_(SQLTypeInfo(kTEXT)) + , return_ti_(SQLTypeInfo(kTEXT, false, kENCODING_DICT)) , has_var_str_literal_(var_str_optional_literal.has_value()) , var_str_literal_(!var_str_optional_literal.has_value() ? NullableStrType() @@ -80,10 +80,10 @@ struct StringOp { } virtual NullableStrType operator()() const { - CHECK(hasVarStringLiteral()); if (var_str_literal_.is_null) { return var_str_literal_; } + CHECK(hasVarStringLiteral()); return operator()(var_str_literal_.str); } @@ -101,10 +101,10 @@ struct StringOp { } virtual Datum numericEval() const { - CHECK(hasVarStringLiteral()); if (var_str_literal_.is_null) { return NullDatum(return_ti_); } + CHECK(hasVarStringLiteral()); return numericEval(var_str_literal_.str); } @@ -206,6 +206,15 @@ struct LevenshteinDistance : public StringOp { const std::string str_literal_; }; +struct Hash : public StringOp { + public: + Hash(const std::optional& var_str_optional_literal) + : StringOp(SqlStringOpKind::HASH, SQLTypeInfo(kBIGINT), var_str_optional_literal) {} + + NullableStrType operator()(const std::string& str) const override; + Datum numericEval(const std::string_view str) const override; +}; + struct Lower : public StringOp { Lower(const std::optional& 
var_str_optional_literal) : StringOp(SqlStringOpKind::LOWER, var_str_optional_literal) {} @@ -439,8 +448,7 @@ struct RegexpSubstr : public StringOp { const std::string& regex_params, const int64_t sub_match_group_idx) : StringOp(SqlStringOpKind::REGEXP_SUBSTR, var_str_optional_literal) - , regex_pattern_str_( - regex_pattern) // for toString() as std::regex does not have str() method + , regex_pattern_str_(regex_pattern) , regex_pattern_( StringOp::generateRegex("REGEXP_SUBSTR", regex_pattern, regex_params, true)) , start_pos_(start_pos > 0 ? start_pos - 1 : start_pos) @@ -472,8 +480,7 @@ struct RegexpReplace : public StringOp { const int64_t occurrence, const std::string& regex_params) : StringOp(SqlStringOpKind::REGEXP_REPLACE, var_str_optional_literal) - , regex_pattern_str_( - regex_pattern) // for toString() as std::regex does not have str() method + , regex_pattern_str_(regex_pattern) , regex_pattern_( StringOp::generateRegex("REGEXP_REPLACE", regex_pattern, regex_params, false)) , replacement_(replacement) @@ -495,6 +502,29 @@ struct RegexpReplace : public StringOp { const int64_t occurrence_; }; +struct RegexpCount : public StringOp { + public: + RegexpCount(const std::optional& var_str_optional_literal, + const std::string& regex_pattern, + const int64_t start_pos, + const std::string& regex_params) + : StringOp(SqlStringOpKind::REGEXP_COUNT, + SQLTypeInfo(kBIGINT), + var_str_optional_literal) + , regex_pattern_str_(regex_pattern) + , regex_pattern_( + StringOp::generateRegex("REGEXP_COUNT", regex_pattern, regex_params, true)) + , start_pos_(start_pos > 0 ? 
start_pos - 1 : start_pos) {} + + NullableStrType operator()(const std::string& str) const override; + Datum numericEval(const std::string_view str) const override; + + private: + const std::string regex_pattern_str_; + const boost::regex regex_pattern_; + const int64_t start_pos_; +}; + // We currently do not allow strict mode JSON parsing per the SQL standard, as // 1) We can't throw run-time errors in the case that the string operator // is evaluated in an actual kernel, which is the case for none-encoded text @@ -577,10 +607,26 @@ struct Base64Decode : public StringOp { NullableStrType operator()(const std::string& str) const override; }; +struct UrlEncode : public StringOp { + UrlEncode(const std::optional& var_str_optional_literal) + : StringOp(SqlStringOpKind::URL_ENCODE, var_str_optional_literal) {} + + NullableStrType operator()(const std::string& str) const override; +}; + +struct UrlDecode : public StringOp { + UrlDecode(const std::optional& var_str_optional_literal) + : StringOp(SqlStringOpKind::URL_DECODE, var_str_optional_literal) {} + + NullableStrType operator()(const std::string& str) const override; +}; + struct NullOp : public StringOp { - NullOp(const std::optional& var_str_optional_literal, + NullOp(const SQLTypeInfo& return_ti, + const std::optional& var_str_optional_literal, const SqlStringOpKind op_kind) - : StringOp(SqlStringOpKind::INVALID, var_str_optional_literal), op_kind_(op_kind) {} + : StringOp(SqlStringOpKind::INVALID, return_ti, var_str_optional_literal) + , op_kind_(op_kind) {} NullableStrType operator()(const std::string& str) const override { return NullableStrType(); // null string diff --git a/Tests/ArrowCsvForeignStorageTest.cpp b/Tests/ArrowCsvForeignStorageTest.cpp index fc5bf88ec0..f93253d649 100644 --- a/Tests/ArrowCsvForeignStorageTest.cpp +++ b/Tests/ArrowCsvForeignStorageTest.cpp @@ -559,6 +559,38 @@ TEST_F(BooleanTest, CheckWithoutNulls) { {1, 1, 1, 0, 0, 1, 0, 1}); } +class CreateDataframeTest : public 
::testing::Test { + protected: + void SetUp() override { + ASSERT_NO_THROW(run_ddl_statement("drop table if exists test_dataframe;")); + } + + void TearDown() override { + ASSERT_NO_THROW(run_ddl_statement("drop table if exists test_dataframe;")); + } + + void queryAndAssertPartialException(const std::string& query, + const std::string& error_message) { + try { + run_ddl_statement(query); + FAIL() << "An exception should have been thrown for this test case"; + } catch (const std::exception& e) { + std::string exception_message{e.what()}; + ASSERT_TRUE(exception_message.find(error_message) != std::string::npos) + << "Exception message: " << exception_message + << ", expected partial error message: " << error_message; + } + } +}; +TEST_F(CreateDataframeTest, CreateOrReplaceDataframe) { + queryAndAssertPartialException( + "CREATE OR REPLACE DATAFRAME test_dataframe(idx " + "integer) FROM 'CSV:../../Tests/FsiDataFiles/0.csv';", + R"(SQL Error: Encountered "DATAFRAME" at line 1, column 19. +Was expecting: + "MODEL" ...)"); +} + } // namespace int main(int argc, char** argv) { @@ -578,6 +610,7 @@ int main(int argc, char** argv) { logger::LogOptions log_options(argv[0]); log_options.max_files_ = 0; // stderr only by default + log_options.set_base_path(BASE_PATH); desc.add(log_options.get_options()); po::variables_map vm; diff --git a/Tests/BufferMgrTest.cpp b/Tests/BufferMgrTest.cpp index 1dd9f3ffc0..509d9be6c0 100644 --- a/Tests/BufferMgrTest.cpp +++ b/Tests/BufferMgrTest.cpp @@ -315,6 +315,7 @@ class BufferMgrTest : public testing::TestWithParam { size_t max_buffer_pool_size = max_buffer_pool_size_, size_t min_slab_size = min_slab_size_, size_t max_slab_size = max_slab_size_, + size_t default_slab_size = default_slab_size_, size_t page_size = page_size_) { auto mgr_type = GetParam(); if (mgr_type == MgrType::CPU_MGR) { @@ -323,6 +324,7 @@ class BufferMgrTest : public testing::TestWithParam { nullptr, min_slab_size, max_slab_size, + default_slab_size, page_size, 
&mock_parent_mgr_); #ifdef HAVE_CUDA @@ -333,6 +335,7 @@ class BufferMgrTest : public testing::TestWithParam { mock_cuda_mgr_.get(), min_slab_size, max_slab_size, + default_slab_size, page_size, &mock_parent_mgr_); #endif @@ -424,6 +427,7 @@ class BufferMgrTest : public testing::TestWithParam { void assertExpectedBufferMgrAttributes(size_t used_size = test_buffer_size_, size_t allocated_size = max_slab_size_, size_t num_chunks = 1, + size_t slab_count = 1, bool is_allocation_capped = false) { EXPECT_EQ(buffer_mgr_->getInUseSize(), used_size); EXPECT_EQ(buffer_mgr_->getNumChunks(), num_chunks); @@ -434,6 +438,7 @@ class BufferMgrTest : public testing::TestWithParam { EXPECT_TRUE(buffer_mgr_->getSlabSegments().empty()); } else { EXPECT_FALSE(buffer_mgr_->getSlabSegments().empty()); + EXPECT_EQ(buffer_mgr_->getSlabSegments().size(), slab_count); } } @@ -513,6 +518,7 @@ class BufferMgrTest : public testing::TestWithParam { static constexpr size_t max_buffer_pool_size_{1000}; static constexpr size_t min_slab_size_{100}; static constexpr size_t max_slab_size_{500}; + static constexpr size_t default_slab_size_{500}; static constexpr size_t page_size_{10}; static constexpr size_t test_buffer_size_{100}; static inline const ChunkKey test_chunk_key_{1, 1, 1, 1}; @@ -531,6 +537,7 @@ TEST_P(BufferMgrTest, CreateBufferMgr) { test_max_buffer_pool_size, test_min_slab_size, test_max_slab_size, + test_max_slab_size, test_page_size); EXPECT_EQ(buffer_mgr_->getDeviceId(), test_device_id); EXPECT_EQ(buffer_mgr_->getMaxBufferSize(), test_max_buffer_pool_size); @@ -622,7 +629,34 @@ TEST_P(BufferMgrTest, CreateBufferExistingSlabWithoutSufficientFreeSegment) { assertExpectedBufferAttributes(buffer); assertExpectedBufferMgrAttributes( - max_slab_size_ - page_size_ + test_buffer_size_, 2 * max_slab_size_, 2); + max_slab_size_ - page_size_ + test_buffer_size_, 2 * max_slab_size_, 2, 2); +} + +TEST_P(BufferMgrTest, CreateBufferNewSlabCreationAtMaxBufferPoolSize) { + const auto 
max_buffer_pool_size = 3 * test_buffer_size_; + const auto max_slab_size = 2 * test_buffer_size_; + buffer_mgr_ = createBufferMgr( + device_id_, max_buffer_pool_size, test_buffer_size_, max_slab_size, max_slab_size); + buffer_mgr_->createBuffer(test_chunk_key_, page_size_, test_buffer_size_); + buffer_mgr_->createBuffer(test_chunk_key_2_, page_size_, test_buffer_size_); + + assertSegmentCount(2); + assertSegmentAttributes( + 0, 0, Buffer_Namespace::USED, test_chunk_key_, test_buffer_size_); + assertSegmentAttributes( + 0, 1, Buffer_Namespace::USED, test_chunk_key_2_, test_buffer_size_); + assertExpectedBufferMgrAttributes(max_slab_size, max_slab_size, 2, 1); + + buffer_mgr_->createBuffer(test_chunk_key_3_, page_size_, test_buffer_size_); + + assertSegmentCount(3); + assertSegmentAttributes( + 0, 0, Buffer_Namespace::USED, test_chunk_key_, test_buffer_size_); + assertSegmentAttributes( + 0, 1, Buffer_Namespace::USED, test_chunk_key_2_, test_buffer_size_); + assertSegmentAttributes( + 1, 0, Buffer_Namespace::USED, test_chunk_key_3_, test_buffer_size_); + assertExpectedBufferMgrAttributes(max_buffer_pool_size, max_buffer_pool_size, 3, 2); } TEST_P(BufferMgrTest, CreateBufferNewSlabCreationError) { @@ -638,7 +672,7 @@ TEST_P(BufferMgrTest, CreateBufferNewSlabCreationError) { OutOfMemory); EXPECT_FALSE(buffer_mgr_->isBufferOnDevice(test_chunk_key_2_)); - assertExpectedBufferMgrAttributes(max_slab_size_, max_slab_size_, 1, true); + assertExpectedBufferMgrAttributes(max_slab_size_, max_slab_size_, 1, 1, true); } TEST_P(BufferMgrTest, CreateBufferCannotCreateFirstSlabError) { @@ -650,7 +684,7 @@ TEST_P(BufferMgrTest, CreateBufferCannotCreateFirstSlabError) { FailedToCreateFirstSlab); EXPECT_FALSE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); - assertExpectedBufferMgrAttributes(0, 0, 0, true); + assertExpectedBufferMgrAttributes(0, 0, 0, 0, true); } TEST_P(BufferMgrTest, CreateBufferEviction) { @@ -663,6 +697,7 @@ TEST_P(BufferMgrTest, CreateBufferEviction) { 
test_max_buffer_pool_size, test_min_slab_size, test_max_slab_size, + test_max_slab_size, page_size_); createUnpinnedBuffers(2); @@ -680,6 +715,102 @@ TEST_P(BufferMgrTest, CreateBufferEviction) { assertExpectedBufferMgrAttributes(2 * test_buffer_size_, test_max_slab_size, 2); } +TEST_P(BufferMgrTest, CreateBufferSizeAboveDefaultSlabSize) { + auto max_buffer_pool_size = 5 * test_buffer_size_; + auto min_slab_size = test_buffer_size_; + auto default_slab_size = 2 * test_buffer_size_; + auto buffer_size = 3 * test_buffer_size_; + auto max_slab_size = 4 * test_buffer_size_; + + buffer_mgr_ = createBufferMgr( + device_id_, max_buffer_pool_size, min_slab_size, max_slab_size, default_slab_size); + EXPECT_FALSE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + auto buffer = buffer_mgr_->createBuffer(test_chunk_key_, page_size_, buffer_size); + EXPECT_TRUE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + assertExpectedBufferAttributes(buffer, true, buffer_size / page_size_, buffer_size); + assertExpectedBufferMgrAttributes(buffer_size, buffer_size); +} + +TEST_P(BufferMgrTest, CreateBufferSizeEqualsDefaultSlabSize) { + auto max_buffer_pool_size = 5 * test_buffer_size_; + auto min_slab_size = test_buffer_size_; + auto default_slab_size = 2 * test_buffer_size_; + auto max_slab_size = 4 * test_buffer_size_; + + buffer_mgr_ = createBufferMgr( + device_id_, max_buffer_pool_size, min_slab_size, max_slab_size, default_slab_size); + EXPECT_FALSE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + auto buffer = buffer_mgr_->createBuffer(test_chunk_key_, page_size_, default_slab_size); + EXPECT_TRUE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + assertExpectedBufferAttributes( + buffer, true, default_slab_size / page_size_, default_slab_size); + assertExpectedBufferMgrAttributes(default_slab_size, default_slab_size); +} + +TEST_P(BufferMgrTest, CreateBufferSizeBelowDefaultSlabSize) { + auto max_buffer_pool_size = 5 * test_buffer_size_; + auto min_slab_size = 
test_buffer_size_; + auto default_slab_size = 2 * test_buffer_size_; + auto buffer_size = test_buffer_size_; + auto max_slab_size = 4 * test_buffer_size_; + + buffer_mgr_ = createBufferMgr( + device_id_, max_buffer_pool_size, min_slab_size, max_slab_size, default_slab_size); + EXPECT_FALSE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + auto buffer = buffer_mgr_->createBuffer(test_chunk_key_, page_size_, buffer_size); + EXPECT_TRUE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + assertExpectedBufferAttributes(buffer, true, buffer_size / page_size_, buffer_size); + assertExpectedBufferMgrAttributes(buffer_size, default_slab_size); +} + +// This test case covers the use case where the last allocation is less than both the +// default slab size and initial max slab size. This occurs when the available space +// in the buffer pool is less than both of these sizes (current_max_num_pages_per_slab_ is +// updated to match available space in this case). +TEST_P(BufferMgrTest, CreateBufferCurrentMaxSlabSizeLessThanDefaultSlabSize) { + auto max_buffer_pool_size = 3 * test_buffer_size_; + auto min_slab_size = test_buffer_size_; + auto default_slab_size = 2 * test_buffer_size_; + auto max_slab_size = 3 * test_buffer_size_; + + buffer_mgr_ = createBufferMgr( + device_id_, max_buffer_pool_size, min_slab_size, max_slab_size, default_slab_size); + EXPECT_FALSE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + + auto buffer = buffer_mgr_->createBuffer(test_chunk_key_, page_size_, default_slab_size); + EXPECT_TRUE(buffer_mgr_->isBufferOnDevice(test_chunk_key_)); + assertExpectedBufferAttributes( + buffer, true, default_slab_size / page_size_, default_slab_size); + + assertSegmentCount(1); + assertSegmentAttributes( + 0, 0, Buffer_Namespace::USED, test_chunk_key_, default_slab_size); + + assertExpectedBufferMgrAttributes(default_slab_size, default_slab_size); + + EXPECT_FALSE(buffer_mgr_->isBufferOnDevice(test_chunk_key_2_)); + + auto buffer_2 = + 
buffer_mgr_->createBuffer(test_chunk_key_2_, page_size_, test_buffer_size_); + EXPECT_TRUE(buffer_mgr_->isBufferOnDevice(test_chunk_key_2_)); + assertExpectedBufferAttributes(buffer_2); + + assertSegmentCount(2); + assertSegmentAttributes( + 0, 0, Buffer_Namespace::USED, test_chunk_key_, default_slab_size); + assertSegmentAttributes( + 1, 0, Buffer_Namespace::USED, test_chunk_key_2_, test_buffer_size_); + + auto total_buffer_size = default_slab_size + test_buffer_size_; + assertExpectedBufferMgrAttributes(total_buffer_size, total_buffer_size, 2, 2); +} + TEST_P(BufferMgrTest, ClearSlabs) { buffer_mgr_ = createBufferMgr(); createUnpinnedBuffers(1); @@ -1386,7 +1517,7 @@ TEST_P(BufferMgrTest, ReserveBufferFreeSegmentInSubsequentSlab) { 1, 1, Buffer_Namespace::USED, test_chunk_key_2_, test_buffer_size_); assertSegmentAttributes(1, 2, Buffer_Namespace::FREE); assertExpectedBufferMgrAttributes( - max_slab_size_ + (2 * test_buffer_size_), 2 * max_slab_size_, 3); + max_slab_size_ + (2 * test_buffer_size_), 2 * max_slab_size_, 3, 2); auto slab_2_remaining_size = max_slab_size_ - (2 * test_buffer_size_); @@ -1422,7 +1553,10 @@ TEST_P(BufferMgrTest, ReserveBufferFreeSegmentInSubsequentSlab) { ASSERT_EQ(segment_it->buffer, nullptr); assertExpectedBufferMgrAttributes( - max_slab_size_ + slab_2_remaining_size + test_buffer_size_, 2 * max_slab_size_, 3); + max_slab_size_ + slab_2_remaining_size + test_buffer_size_, + 2 * max_slab_size_, + 3, + 2); } TEST_P(BufferMgrTest, ReserveBufferNewSlabCreation) { @@ -1436,7 +1570,7 @@ TEST_P(BufferMgrTest, ReserveBufferNewSlabCreation) { assertSegmentAttributes( 0, 1, Buffer_Namespace::USED, test_chunk_key_2_, test_buffer_size_); assertSegmentAttributes(0, 2, Buffer_Namespace::FREE); - assertExpectedBufferMgrAttributes(2 * test_buffer_size_, max_slab_size_, 2); + assertExpectedBufferMgrAttributes(2 * test_buffer_size_, max_slab_size_, 2, 1); auto segment_it = getSegmentAt(0, 0); segment_it = buffer_mgr_->reserveBuffer(segment_it, 
max_slab_size_); @@ -1448,7 +1582,7 @@ TEST_P(BufferMgrTest, ReserveBufferNewSlabCreation) { assertSegmentAttributes(0, 2, Buffer_Namespace::FREE); assertSegmentAttributes(1, 0, Buffer_Namespace::USED, test_chunk_key_, max_slab_size_); assertExpectedBufferMgrAttributes( - max_slab_size_ + test_buffer_size_, 2 * max_slab_size_, 2); + max_slab_size_ + test_buffer_size_, 2 * max_slab_size_, 2, 2); } TEST_P(BufferMgrTest, ReserveBufferNewSlabCreationError) { @@ -1501,7 +1635,7 @@ TEST_P(BufferMgrTest, ReserveBufferNewSlabCreationPreviouslyEmptyBuffer) { assertSegmentAttributes(0, 1, Buffer_Namespace::FREE); assertSegmentAttributes(1, 0, Buffer_Namespace::USED, test_chunk_key_, max_slab_size_); assertExpectedBufferMgrAttributes( - max_slab_size_ + test_buffer_size_, 2 * max_slab_size_, 2); + max_slab_size_ + test_buffer_size_, 2 * max_slab_size_, 2, 2); } TEST_P(BufferMgrTest, ReserveBufferWithBufferEviction) { @@ -1514,6 +1648,7 @@ TEST_P(BufferMgrTest, ReserveBufferWithBufferEviction) { test_max_buffer_pool_size, test_min_slab_size, test_max_slab_size, + test_max_slab_size, page_size_); constexpr size_t test_buffer_size = test_max_slab_size / 4; @@ -1552,6 +1687,7 @@ TEST_P(BufferMgrTest, test_max_buffer_pool_size, test_min_slab_size, test_max_slab_size, + test_max_slab_size, page_size_); createUnpinnedBuffers(10); setSegmentScores({ @@ -1562,7 +1698,8 @@ TEST_P(BufferMgrTest, assertSegmentCount(10); assertSegmentAttributes( 0, 0, Buffer_Namespace::USED, test_chunk_key_, test_buffer_size_); - assertExpectedBufferMgrAttributes(10 * test_buffer_size_, 2 * test_max_slab_size, 10); + assertExpectedBufferMgrAttributes( + 10 * test_buffer_size_, 2 * test_max_slab_size, 10, 2); auto segment_it = getSegmentAt(0, 0); segment_it->buffer->pin(); @@ -1573,7 +1710,7 @@ TEST_P(BufferMgrTest, assertSegmentAttributes(0, 0, Buffer_Namespace::FREE, {}, test_buffer_size_); assertSegmentAttributes( 1, 2, Buffer_Namespace::USED, test_chunk_key_, test_buffer_size_ * 3); - 
assertExpectedBufferMgrAttributes(9 * test_buffer_size_, 2 * test_max_slab_size, 7); + assertExpectedBufferMgrAttributes(9 * test_buffer_size_, 2 * test_max_slab_size, 7, 2); } TEST_P(BufferMgrTest, ReserveBufferWithBufferEvictionLastSegmentEvicted) { @@ -1586,6 +1723,7 @@ TEST_P(BufferMgrTest, ReserveBufferWithBufferEvictionLastSegmentEvicted) { test_max_buffer_pool_size, test_min_slab_size, test_max_slab_size, + test_max_slab_size, page_size_); constexpr size_t test_buffer_size = test_max_slab_size / 3; @@ -1621,6 +1759,7 @@ TEST_P(BufferMgrTest, ReserveBufferWithBufferEvictionNoBufferToEvict) { test_max_buffer_pool_size, test_min_slab_size, test_max_slab_size, + test_max_slab_size, page_size_); constexpr size_t test_buffer_size = test_max_slab_size / 4; @@ -1931,7 +2070,7 @@ TEST_P(BufferMgrTest, GetBufferReentrantReserveFreeSegmentInSubsequentSlab) { buffer_mgr_->createBuffer(test_chunk_key_2_, page_size_, test_buffer_size_); assertExpectedBufferMgrAttributes( - max_slab_size_ + (2 * test_buffer_size_), 2 * max_slab_size_, 3); + max_slab_size_ + (2 * test_buffer_size_), 2 * max_slab_size_, 3, 2); auto new_reserved_size = max_slab_size_ - (2 * test_buffer_size_); mock_parent_mgr_.setReserveSize(new_reserved_size); @@ -1947,7 +2086,7 @@ TEST_P(BufferMgrTest, GetBufferReentrantReserveFreeSegmentInSubsequentSlab) { EXPECT_EQ(read_content, new_source_content); assertExpectedBufferMgrAttributes( - max_slab_size_ + new_reserved_size + test_buffer_size_, 2 * max_slab_size_, 3); + max_slab_size_ + new_reserved_size + test_buffer_size_, 2 * max_slab_size_, 3, 2); assertParentMethodCalledWithParams( ParentMgrMethod::kFetchBuffer, {{test_chunk_key_, buffer, new_source_content.size()}}); @@ -1973,7 +2112,7 @@ TEST_P(BufferMgrTest, GetBufferReentrantReserveNewSlabCreation) { EXPECT_EQ(read_content, new_source_content); assertExpectedBufferMgrAttributes( - max_slab_size_ + test_buffer_size_, 2 * max_slab_size_, 2); + max_slab_size_ + test_buffer_size_, 2 * max_slab_size_, 2, 
2); assertParentMethodCalledWithParams( ParentMgrMethod::kFetchBuffer, {{test_chunk_key_, buffer, new_source_content.size()}}); @@ -2011,6 +2150,7 @@ TEST_P(BufferMgrTest, GetBufferReentrantReserveWithEviction) { test_max_buffer_pool_size, test_min_slab_size, test_max_slab_size, + test_max_slab_size, page_size_); auto buffer_1_size = test_max_slab_size * 3 / 4; auto buffer_1 = buffer_mgr_->createBuffer(test_chunk_key_, page_size_, buffer_1_size); @@ -2051,6 +2191,7 @@ TEST_P(BufferMgrTest, GetBufferReentrantReserveWithEvictionOfLastSegmentInSlab) test_max_buffer_pool_size, test_min_slab_size, test_max_slab_size, + test_max_slab_size, page_size_); auto buffer_1_size = test_max_slab_size / 4; @@ -2093,6 +2234,7 @@ TEST_P(BufferMgrTest, GetBufferReentrantReserveCannotEvict) { test_max_buffer_pool_size, test_min_slab_size, test_max_slab_size, + test_max_slab_size, page_size_); createPinnedBuffers(2); diff --git a/Tests/BumpAllocatorTest.cpp b/Tests/BumpAllocatorTest.cpp index 625bfb8095..fdf18c24a8 100644 --- a/Tests/BumpAllocatorTest.cpp +++ b/Tests/BumpAllocatorTest.cpp @@ -280,6 +280,7 @@ int main(int argc, char** argv) { logger::LogOptions log_options(argv[0]); log_options.max_files_ = 0; // stderr only by default + log_options.set_base_path(BASE_PATH); desc.add(log_options.get_options()); po::variables_map vm; diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index df07b882a8..b20cf1b6dd 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -791,6 +791,11 @@ target_link_libraries(TableUpdateDeleteBenchmark benchmark ${EXECUTE_TEST_LIBS}) ########## +add_executable(OneDALBenchmark OneDALBenchmark.cpp) +target_link_libraries(OneDALBenchmark benchmark ${EXECUTE_TEST_LIBS}) + +########## + add_executable(UtilTest UtilTest.cpp) target_link_libraries(UtilTest Utils gtest Logger Shared ${Boost_LIBRARIES} OSDependent) add_test(UtilTest UtilTest ${TEST_ARGS}) diff --git a/Tests/CachingFileMgrTest.cpp b/Tests/CachingFileMgrTest.cpp index 
6cfd6bf5bf..92dd1a48e3 100644 --- a/Tests/CachingFileMgrTest.cpp +++ b/Tests/CachingFileMgrTest.cpp @@ -55,7 +55,7 @@ class CachingFileMgrTest : public testing::Test { // Keep page size small for these tests so we can hit limits more easily. static constexpr size_t page_size_ = 64; static constexpr size_t page_data_size_ = - page_size_ - fn::FileBuffer::headerBufferOffset_; + page_size_ - fn::FileBuffer::kHeaderBufferOffset; static constexpr size_t data_file_size_ = page_size_ * fn::CachingFileMgr::DEFAULT_NUM_PAGES_PER_DATA_FILE; static constexpr size_t meta_file_size_ = diff --git a/Tests/CatalogMigrationTest.cpp b/Tests/CatalogMigrationTest.cpp index f0e4a37628..8e2ac58e4e 100644 --- a/Tests/CatalogMigrationTest.cpp +++ b/Tests/CatalogMigrationTest.cpp @@ -317,13 +317,15 @@ class ForeignTablesTest : public DBHandlerTestFixture { } void TearDown() override { + g_enable_fsi = true; + g_enable_system_tables = true; + g_enable_s3_fsi = true; dropTestTables(); DBHandlerTestFixture::TearDown(); } private: void dropTestTables() { - g_enable_fsi = true; sql("DROP FOREIGN TABLE IF EXISTS test_foreign_table;"); sql("DROP TABLE IF EXISTS test_table;"); sql("DROP VIEW IF EXISTS test_view;"); @@ -347,6 +349,10 @@ TEST_F(ForeignTablesTest, ForeignTablesAreNotDroppedWhenFsiIsDisabled) { ASSERT_NE(nullptr, getCatalog().getMetadataForTable("test_view", false)); g_enable_fsi = false; + // The following flags should be false when FSI is disabled. 
+ g_enable_system_tables = false; + g_enable_s3_fsi = false; + resetCatalog(); loginAdmin(); diff --git a/Tests/ColumnarResultsTest.cpp b/Tests/ColumnarResultsTest.cpp index f35a1e2e9c..73a27d4c16 100644 --- a/Tests/ColumnarResultsTest.cpp +++ b/Tests/ColumnarResultsTest.cpp @@ -21,6 +21,7 @@ #include "QueryEngine/ResultSet.h" #include "QueryEngine/TargetValue.h" #include "Shared/TargetInfo.h" +#include "Tests/DataMgrTestHelpers.h" #include "Tests/ResultSetTestUtils.h" #include "Tests/TestHelpers.h" @@ -40,6 +41,7 @@ class ColumnarResultsTester : public ColumnarResults { num_columns, target_types, Executor::UNITARY_EXECUTOR_ID, + 0, is_parallel_execution_enforced) {} template @@ -70,8 +72,8 @@ void test_columnar_conversion(const std::vector& target_infos, const QueryMemoryDescriptor& query_mem_desc, const size_t non_empty_step_size, const bool is_parallel_conversion = false) { - auto row_set_mem_owner = std::make_shared( - Executor::getArenaBlockSize(), 0, /*num_threads=*/1); + auto row_set_mem_owner = + std::make_shared(Executor::getArenaBlockSize(), 0); ResultSet result_set( target_infos, ExecutorDeviceType::CPU, query_mem_desc, row_set_mem_owner, 0, 0); @@ -163,8 +165,8 @@ TEST(Construct, Empty) { std::vector target_infos; std::vector sql_type_infos; QueryMemoryDescriptor query_mem_desc; - auto row_set_mem_owner = std::make_shared( - Executor::getArenaBlockSize(), 0, /*num_threads=*/1); + auto row_set_mem_owner = + std::make_shared(Executor::getArenaBlockSize(), 0); ResultSet result_set( target_infos, ExecutorDeviceType::CPU, query_mem_desc, row_set_mem_owner, 0, 0); ColumnarResultsTester columnar_results( @@ -839,6 +841,7 @@ int main(int argc, char** argv) { TestHelpers::init_logger_stderr_only(argc, argv); testing::InitGoogleTest(&argc, argv); + TestHelpers::init_sys_catalog(); int err{0}; try { diff --git a/Tests/CommandLineTest.cpp b/Tests/CommandLineTest.cpp index 1d48594cb3..5ec13b806d 100644 --- a/Tests/CommandLineTest.cpp +++ b/Tests/CommandLineTest.cpp 
@@ -85,6 +85,30 @@ class CommandLineTestcase { std::string std_out_line_, std_err_line_, std_out_string_ = "", std_err_string_ = ""; bp::ipstream std_out_pipe_, std_err_pipe_; +#ifdef HAVE_ASAN + // clang-format off + // If any ASAN suppressions were used, it will be added to stderr. Example: + // ----------------------------------------------------- + // Suppressions used: + // count bytes template + // 6 144 _GLOBAL__sub_I_common.cpp + // 3 72 _GLOBAL__sub_I_register_serializable.cpp + // 2 48 oneapi::dal::detail::serializable_registry::register_default_factory + // ----------------------------------------------------- + // Erase these from std_err_string. + // clang-format on + void erase_asan_report(std::string& std_err_string) { + std::string const delimiter = "-----------------------------------------------------"; + size_t start_pos = std_err_string.find(delimiter); + if (start_pos != std::string::npos) { + size_t end_pos = std_err_string.find(delimiter, start_pos + delimiter.length()); + if (end_pos != std::string::npos) { + std_err_string.erase(start_pos, end_pos - start_pos + delimiter.length()); + } + } + } +#endif + // Runs the testcase and evalutates return code, stdErr, and stdOut. void evaluate() { int returnCode = bp::system(executable_.string() + " " + flags, @@ -96,6 +120,9 @@ class CommandLineTestcase { while (std::getline(std_err_pipe_, std_err_line_)) { std_err_string_ += std_err_line_; } +#ifdef HAVE_ASAN + erase_asan_report(std_err_string_); +#endif // Since we are using raw strings, prune out any newlines. 
boost::erase_all(expected_std_out_, "\n"); boost::erase_all(expected_std_err_, "\n"); diff --git a/Tests/ComputeMetadataTest.cpp b/Tests/ComputeMetadataTest.cpp index 09ff2f0156..68ec1dfca4 100644 --- a/Tests/ComputeMetadataTest.cpp +++ b/Tests/ComputeMetadataTest.cpp @@ -29,6 +29,7 @@ #endif extern float g_vacuum_min_selectivity; +extern bool g_use_cpu_mem_pool_for_output_buffers; namespace { @@ -1870,6 +1871,13 @@ class OpportunisticVacuumingMemoryUseTest : public OpportunisticVacuumingTest { auto& data_mgr = getCatalog().getDataMgr(); data_mgr.resetBufferMgrs(disk_cache_config, 0, system_parameters); } + + void removeFragmenterForTable(const std::string& table_name) { + auto& catalog = getCatalog(); + auto table_id = catalog.getTableId(table_name); + CHECK(table_id.has_value()); + catalog.removeFragmenterForTable(table_id.value()); + } }; TEST_F(OpportunisticVacuumingMemoryUseTest, PulledInChunkDeleted) { @@ -1973,7 +1981,25 @@ TEST_F(OpportunisticVacuumingMemoryUseTest, StaleChunkOnGpuDeleted) { {{i(3)}, {i(4)}, {i(5)}, {i(6)}, {i(7)}, {i(8)}}); } -TEST_F(OpportunisticVacuumingMemoryUseTest, OneFragmentProcessedAtATime) { +class OpportunisticVacuumingMemoryUseParamTest + : public OpportunisticVacuumingMemoryUseTest, + public testing::WithParamInterface { + public: + void SetUp() override { + g_use_cpu_mem_pool_for_output_buffers = GetParam(); + OpportunisticVacuumingMemoryUseTest::SetUp(); + } + + static void TearDownTestSuite() { + g_use_cpu_mem_pool_for_output_buffers = orig_use_cpu_mem_pool_for_output_buffers; + } + + private: + inline static const bool orig_use_cpu_mem_pool_for_output_buffers{ + g_use_cpu_mem_pool_for_output_buffers}; +}; + +TEST_P(OpportunisticVacuumingMemoryUseParamTest, OneFragmentProcessedAtATime) { sql("create table test_table (i int, t text encoding none) with (fragment_size = 5);"); insertRange(0, 9, "a"); @@ -1985,19 +2011,41 @@ TEST_F(OpportunisticVacuumingMemoryUseTest, OneFragmentProcessedAtATime) { // Auto-vacuuming should pull 
in additional chunks of 34 bytes i.e. index chunk of 4 // bytes (initial offset) + 5 * 4 bytes and data chunk of 5 * 2 bytes per fragment. system_parameters.max_cpu_slab_size = 84; + system_parameters.default_cpu_slab_size = system_parameters.max_cpu_slab_size; resetBufferMgrs(system_parameters); sql("delete from test_table where i <= 1 or i >= 8;"); - // Assert that only one slab was used. + // One slab is used for chunks and 2 additional slabs are used for query output buffers. auto memory_info_vector = getCatalog().getDataMgr().getMemoryInfo(MemoryLevel::CPU_LEVEL); ASSERT_EQ(memory_info_vector.size(), size_t(1)); for (const auto& memory_data : memory_info_vector[0].nodeMemoryData) { - EXPECT_EQ(memory_data.slabNum, size_t(0)); + if (g_use_cpu_mem_pool_for_output_buffers) { + EXPECT_LE(memory_data.slabNum, size_t(2)); + // The second and third slabs should be used for query output buffers that are freed + // at the end of the query. + if (memory_data.slabNum == 1 || memory_data.slabNum == 2) { + EXPECT_EQ(memory_data.memStatus, Buffer_Namespace::MemStatus::FREE); + } + } else { + EXPECT_EQ(memory_data.slabNum, size_t(0)); + } } - sqlAndCompareResult("select * from test_table order by i;", + constexpr auto select_query{"select * from test_table order by i;"}; + if (g_use_cpu_mem_pool_for_output_buffers) { + // An error should occur due to required output buffer size exceeding the max slab + // size. + queryAndAssertException( + select_query, "OUT_OF_CPU_MEM: Not enough host memory to execute the query"); + + // Reset to the default buffer manager parameters in order to ensure that the max slab + // size is big enough to contain the output buffers. 
+ removeFragmenterForTable("test_table"); + resetBufferMgrs(getSystemParameters()); + } + sqlAndCompareResult(select_query, {{i(2), "a2"}, {i(3), "a3"}, {i(4), "a4"}, @@ -2006,6 +2054,14 @@ TEST_F(OpportunisticVacuumingMemoryUseTest, OneFragmentProcessedAtATime) { {i(7), "a7"}}); } +INSTANTIATE_TEST_SUITE_P(DifferentOutputBufferAllocators, + OpportunisticVacuumingMemoryUseParamTest, + testing::Bool(), + [](const auto& param_info) { + return (param_info.param ? "LegacyAllocator" + : "CpuBufferMgrAllocator"); + }); + int main(int argc, char** argv) { TestHelpers::init_logger_stderr_only(argc, argv); testing::InitGoogleTest(&argc, argv); diff --git a/Tests/CorrelatedSubqueryTest.cpp b/Tests/CorrelatedSubqueryTest.cpp index 8c6f63d441..f47f7cab67 100644 --- a/Tests/CorrelatedSubqueryTest.cpp +++ b/Tests/CorrelatedSubqueryTest.cpp @@ -1646,6 +1646,40 @@ TEST(Select, InExpr_As_Child_Operand_Of_OR_Operator) { check_query(q4, true); } +TEST(Select, NotSupportedDecorrelation) { + auto drop_table = []() { + for (std::string tbl : {"test_decor1", "test_decor2", "test_decor3"}) { + QR::get()->runDDLStatement("DROP TABLE IF EXISTS " + tbl + ";"); + } + }; + ScopeGuard drop_tbls = [drop_table] { drop_table(); }; + drop_table(); + QR::get()->runDDLStatement("CREATE TABLE test_decor1 (a int, b int, c int);"); + QR::get()->runDDLStatement("CREATE TABLE test_decor2 (d int, e int, f int);"); + QR::get()->runDDLStatement("CREATE TABLE test_decor3 (g int, h int, i int);"); + + EXPECT_ANY_THROW(QR::get()->runSQL( + "select COUNT(c) from test_decor1 where b > 0 and a in (select d from test_decor2 " + "where e > 0 and c in (select i from test_decor3));", + ExecutorDeviceType::CPU)); + EXPECT_NO_THROW( + QR::get()->runSQL("select COUNT(c) from test_decor1 where b > 0 and a in (select d " + "from test_decor2 where c in (select i from test_decor3));", + ExecutorDeviceType::CPU)); + ScopeGuard reset_flag = [orig = g_enable_watchdog]() { g_enable_watchdog = orig; }; + std::string const q3 = + 
"select COUNT(c) from test_decor1 where b > 0 and a in (select d AS dd from " + "test_decor2 where c in (select i from test_decor3));"; + for (bool watchdog : {false, true}) { + g_enable_watchdog = watchdog; + if (watchdog) { + EXPECT_ANY_THROW(QR::get()->runSQL(q3, ExecutorDeviceType::CPU)); + } else { + EXPECT_NO_THROW(QR::get()->runSQL(q3, ExecutorDeviceType::CPU)); + } + } +} + int main(int argc, char* argv[]) { testing::InitGoogleTest(&argc, argv); TestHelpers::init_logger_stderr_only(argc, argv); diff --git a/Tests/CreateAndDropTableDdlTest.cpp b/Tests/CreateAndDropTableDdlTest.cpp index 43d48ac4e8..372fe20e77 100644 --- a/Tests/CreateAndDropTableDdlTest.cpp +++ b/Tests/CreateAndDropTableDdlTest.cpp @@ -27,6 +27,7 @@ #include "Catalog/ForeignTable.h" #include "Catalog/TableDescriptor.h" #include "DBHandlerTestHelpers.h" +#include "DataMgr/FileMgr/FileBuffer.h" #include "Fragmenter/FragmentDefaultValues.h" #include "Shared/SysDefinitions.h" #include "Shared/scope.h" @@ -75,16 +76,27 @@ class CreateAndDropTableDdlTest : public DBHandlerTestFixture { std::string getCreateTableQuery(const ddl_utils::TableType table_type, const std::string& table_name, const std::string& columns, - bool if_not_exists = false) { - return getCreateTableQuery(table_type, table_name, columns, {}, if_not_exists); + bool if_not_exists = false, + bool or_replace = false, + bool temporary_table = false) const { + return getCreateTableQuery( + table_type, table_name, columns, {}, if_not_exists, or_replace, temporary_table); } std::string getCreateTableQuery(const ddl_utils::TableType table_type, const std::string& table_name, const std::string& columns, std::map options, - bool if_not_exists = false) { + bool if_not_exists = false, + bool or_replace = false, + bool temporary_table = false) const { std::string query{"CREATE "}; + if (or_replace) { + query += "OR REPLACE "; + } + if (temporary_table) { + query += "TEMPORARY "; + } if (table_type == ddl_utils::TableType::FOREIGN_TABLE) { 
query += "FOREIGN TABLE "; } else { @@ -132,7 +144,7 @@ class CreateAndDropTableDdlTest : public DBHandlerTestFixture { return query; } - std::string getTestFilePath() { + std::string getTestFilePath() const { return bf::canonical("../../Tests/FsiDataFiles/example_1.csv").string(); } @@ -166,12 +178,14 @@ class CreateTableTest : public CreateAndDropTableDdlTest, protected: void SetUp() override { CreateAndDropTableDdlTest::SetUp(); + sql("DROP VIEW IF EXISTS test_view"); sql(getDropTableQuery(GetParam(), "test_table", true)); dropTestUser(); } void TearDown() override { g_enable_fsi = true; + sql("DROP VIEW IF EXISTS test_view"); sql(getDropTableQuery(GetParam(), "test_table", true)); dropTestUser(); CreateAndDropTableDdlTest::TearDown(); @@ -1723,6 +1737,49 @@ TEST_P(CreateTableTest, RealAlias) { ASSERT_EQ(cd->columnType.get_type(), kFLOAT); } +TEST_P(CreateTableTest, CreateOrReplaceTable) { + auto query = + getCreateTableQuery(GetParam(), "test_table", "(idx INTEGER)", false, true); + // using a partial exception for the sake of brevity + if (GetParam() == ddl_utils::TableType::FOREIGN_TABLE) { + queryAndAssertPartialException( + query, + R"(SQL Error: Encountered "FOREIGN" at line 1, column 19. +Was expecting: + "MODEL" ...)"); + } else { + queryAndAssertPartialException(query, + R"(SQL Error: Encountered "TABLE" at line 1, column 19. +Was expecting: + "MODEL" ...)"); + } +} + +TEST_P(CreateTableTest, CreateOrReplaceTemporaryTable) { + if (GetParam() == ddl_utils::TableType::FOREIGN_TABLE) { + GTEST_SKIP() << "Foreign tables can't be temporary."; + } + auto query = + getCreateTableQuery(GetParam(), "test_table", "(idx INTEGER)", false, true, true); + // using a partial exception for the sake of brevity + queryAndAssertPartialException( + query, + R"(SQL Error: Encountered "TEMPORARY" at line 1, column 19. 
+Was expecting: + "MODEL" ...)"); +} + +TEST_P(CreateTableTest, CreateOrReplaceView) { + sql(getCreateTableQuery(GetParam(), "test_table", "(idx INTEGER)")); + auto query = + std::string("CREATE OR REPLACE VIEW test_view AS SELECT * FROM test_table"); + // using a partial exception for the sake of brevity + queryAndAssertPartialException(query, + R"(SQL Error: Encountered "VIEW" at line 1, column 19. +Was expecting: + "MODEL" ...)"); +} + INSTANTIATE_TEST_SUITE_P(CreateAndDropTableDdlTest, DropTableTest, testing::Values(ddl_utils::TableType::TABLE, @@ -2438,6 +2495,85 @@ TEST_F(CommentsBeforeCommandTest, MultiLineCommentBeforeCommand) { "SQL statements starting with comments are currently not allowed."); } +class TableOptionsValidationTest : public CreateAndDropTableDdlTest { + protected: + void TearDown() override { + sql(getDropTableQuery(ddl_utils::TableType::TABLE, "test_table", true)); + CreateAndDropTableDdlTest::TearDown(); + } + + std::string getCreateTableQuery( + const std::map& options) const { + return CreateAndDropTableDdlTest::getCreateTableQuery( + ddl_utils::TableType::TABLE, "test_table", "(i INTEGER)", options); + } +}; + +class PageSizeValidationTest : public TableOptionsValidationTest { + protected: + std::string getCreateTableQuery(size_t page_size) const { + return TableOptionsValidationTest::getCreateTableQuery( + {{"page_size", std::to_string(page_size)}}); + } +}; + +TEST_F(PageSizeValidationTest, BelowMinPageSize) { + const auto min_page_size = File_Namespace::FileBuffer::getMinPageSize(); + queryAndAssertException( + PageSizeValidationTest::getCreateTableQuery(min_page_size - 1), + "page_size cannot be less than " + std::to_string(min_page_size)); +} + +TEST_F(PageSizeValidationTest, MinPageSize) { + sql(PageSizeValidationTest::getCreateTableQuery( + File_Namespace::FileBuffer::getMinPageSize())); + sql("INSERT INTO test_table VALUES (1);"); + sqlAndCompareResult("SELECT * FROM test_table;", {{i(1)}}); +} + +class 
StringTypeErrorValidationTest : public TableOptionsValidationTest, + public testing::WithParamInterface {}; + +TEST_P(StringTypeErrorValidationTest, StringOptions) { + const auto& option_name = GetParam(); + queryAndAssertException(getCreateTableQuery({{option_name, "0"}}), + "The \"" + option_name + "\" option must be a string."); +} + +INSTANTIATE_TEST_SUITE_P( + OptionTypeValidation, + StringTypeErrorValidationTest, + ::testing::Values("partitions", "sort_column", "storage_type", "vacuum")); + +class IntTypeErrorValidationTest : public TableOptionsValidationTest, + public testing::WithParamInterface { + protected: + std::string getCreateTableQuery(const std::map& options, + bool include_shard_key) const { + return CreateAndDropTableDdlTest::getCreateTableQuery( + ddl_utils::TableType::TABLE, + "test_table", + "(i INTEGER"s + (include_shard_key ? ", SHARD KEY (i)" : "") + ")", + options); + } +}; + +TEST_P(IntTypeErrorValidationTest, IntegerOptions) { + const auto& option_name = GetParam(); + queryAndAssertException( + getCreateTableQuery({{option_name, "'test'"}}, option_name == "shard_count"), + to_upper(option_name) + " must be an integer literal."); +} + +INSTANTIATE_TEST_SUITE_P(OptionTypeValidation, + IntTypeErrorValidationTest, + ::testing::Values("fragment_size", + "max_chunk_size", + "page_size", + "max_rows", + "shard_count", + "max_rollback_epochs")); + int main(int argc, char** argv) { g_enable_fsi = true; TestHelpers::init_logger_stderr_only(argc, argv); diff --git a/Tests/CtasUpdateTest.cpp b/Tests/CtasUpdateTest.cpp index eeb0ee0fea..64d2e323a9 100644 --- a/Tests/CtasUpdateTest.cpp +++ b/Tests/CtasUpdateTest.cpp @@ -3124,7 +3124,7 @@ TEST_F(Non_Kernel_Time_Interrupt, Interrupt_ITAS) { lockmgr::LockedTableDescriptors locks; db_handler->sql_execute(result, session_id, query, false, -1, -1, locks); } catch (const QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { 
catchInterruption.store(true); } else if (e.getErrorCode() < 0) { std::cout << "Detect out of slot issue in the query output buffer while " @@ -3235,7 +3235,7 @@ TEST_F(Non_Kernel_Time_Interrupt, Interrupt_CTAS) { lockmgr::LockedTableDescriptors locks; db_handler->sql_execute(result, session_id, query, false, -1, -1, locks); } catch (const QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { catchInterruption.store(true); } else if (e.getErrorCode() < 0) { std::cout << "Detect out of slot issue in the query output buffer while " @@ -3621,6 +3621,7 @@ class CtasTableTest : public DBHandlerTestFixture, void SetUp() override { DBHandlerTestFixture::SetUp(); // Default connection string outside of thrift + ASSERT_NO_THROW(sql("drop table if exists test_table;")); ASSERT_NO_THROW(sql("drop table if exists ctas_test;")); ASSERT_NO_THROW(sql("drop table if exists ctas_test_empty;")); ASSERT_NO_THROW(sql("drop table if exists ctas_test_full;")); @@ -3628,6 +3629,7 @@ class CtasTableTest : public DBHandlerTestFixture, } void TearDown() override { + ASSERT_NO_THROW(sql("drop table if exists test_table;")); ASSERT_NO_THROW(sql("drop table if exists ctas_test;")); ASSERT_NO_THROW(sql("drop table if exists ctas_test_empty;")); ASSERT_NO_THROW(sql("drop table if exists ctas_test_full;")); @@ -3815,6 +3817,18 @@ TEST_F(CtasTableTest, CreateTableAsSelect) { // } } +TEST_F(CtasTableTest, CreateOrReplaceCtasTable) { + sql("CREATE TABLE test_table (idx integer)"); + sql("INSERT INTO test_table (idx) VALUES (1), (2), (3)"); + auto query = std::string( + "CREATE OR REPLACE TABLE test_table_as_select AS SELECT * FROM test_table"); + // using a partial exception for the sake of brevity + queryAndAssertPartialException(query, + R"(SQL Error: Encountered "TABLE" at line 1, column 19. 
+Was expecting: + "MODEL" ...)"); +} + class NullTextArrayTest : public DBHandlerTestFixture { protected: void SetUp() override { diff --git a/Tests/DBObjectPrivilegesTest.cpp b/Tests/DBObjectPrivilegesTest.cpp index 3ea5af686a..17bb9b6fd6 100644 --- a/Tests/DBObjectPrivilegesTest.cpp +++ b/Tests/DBObjectPrivilegesTest.cpp @@ -4722,13 +4722,32 @@ TEST(SyncUserWithRemoteProvider, IS_SUPER) { ASSERT_EQ(u2->isSuper, false); } -class DropUserTest : public DBHandlerTestFixture {}; +class CreateDropUserTest : public DBHandlerTestFixture { + void SetUp() override { + DBHandlerTestFixture::SetUp(); + sql("drop user if exists test_user"); + } -TEST_F(DropUserTest, DropAdmin) { + void TearDown() override { + sql("drop user if exists test_user"); + DBHandlerTestFixture::TearDown(); + } +}; + +TEST_F(CreateDropUserTest, DropAdmin) { queryAndAssertException("DROP USER admin;", "Cannot drop user. User admin is required to exist."); } +TEST_F(CreateDropUserTest, CreateOrReplaceUser) { + auto query = std::string("CREATE OR REPLACE USER test_user"); + // using a partial exception for the sake of brevity + queryAndAssertPartialException(query, + R"(SQL Error: Encountered "USER" at line 1, column 19. +Was expecting: + "MODEL" ...)"); +} + class CreateDropDatabaseTest : public DBHandlerTestFixture { protected: void SetUp() override { @@ -4791,6 +4810,16 @@ TEST_F(CreateDropDatabaseTest, LegacyOrphanedDB) { {"orphan_db", ""}}); } +TEST_F(CreateDropDatabaseTest, CreateOrReplaceDatabase) { + auto query = std::string("CREATE OR REPLACE DATABASE orphan_db"); + // using a partial exception for the sake of brevity + queryAndAssertPartialException( + query, + R"(SQL Error: Encountered "DATABASE" at line 1, column 19. 
+Was expecting: + "MODEL" ...)"); +} + class DatabaseCaseSensitiveTest : public DBHandlerTestFixture { protected: static void SetUpTestSuite() { @@ -4933,6 +4962,55 @@ TEST_F(DatabaseCaseSensitiveTest, GetInternalTableDetailsForDatabase) { ASSERT_EQ(table_details.row_desc[1].col_name, "rowid"); } +class CreatePolicy : public DBHandlerTestFixture { + protected: + void SetUp() override { + DBHandlerTestFixture::SetUp(); + sql("DROP TABLE IF EXISTS test_table"); + sql("DROP USER IF EXISTS test_user"); + sql("CREATE TABLE test_table(idx INTEGER)"); + sql("CREATE USER test_user"); + } + + void TearDown() override { + sql("DROP TABLE IF EXISTS test_table"); + sql("DROP USER IF EXISTS test_user"); + DBHandlerTestFixture::TearDown(); + } +}; + +TEST_F(CreatePolicy, CreateOrReplacePolicy) { + auto query = std::string( + "CREATE OR REPLACE POLICY ON COLUMN test_table.idx TO test_user VALUES(0)"); + // using a partial exception for the sake of brevity + queryAndAssertPartialException(query, + R"(SQL Error: Encountered "POLICY" at line 1, column 19. +Was expecting: + "MODEL" ...)"); +} + +class CreateRole : public DBHandlerTestFixture { + protected: + void SetUp() override { + DBHandlerTestFixture::SetUp(); + sql("DROP ROLE IF EXISTS test_role"); + } + + void TearDown() override { + sql("DROP ROLE IF EXISTS test_role"); + DBHandlerTestFixture::TearDown(); + } +}; + +TEST_F(CreateRole, CreateOrReplaceRole) { + auto query = std::string("CREATE OR REPLACE ROLE test_role"); + // using a partial exception for the sake of brevity + queryAndAssertPartialException(query, + R"(SQL Error: Encountered "ROLE" at line 1, column 19. 
+Was expecting: + "MODEL" ...)"); +} + int main(int argc, char* argv[]) { testing::InitGoogleTest(&argc, argv); @@ -4950,6 +5028,7 @@ int main(int argc, char* argv[]) { logger::LogOptions log_options(argv[0]); log_options.max_files_ = 0; // stderr only by default + log_options.set_base_path(BASE_PATH); desc.add(log_options.get_options()); po::variables_map vm; diff --git a/Tests/DataMgrTest.cpp b/Tests/DataMgrTest.cpp index 5a22ac9992..51991a085d 100644 --- a/Tests/DataMgrTest.cpp +++ b/Tests/DataMgrTest.cpp @@ -47,6 +47,7 @@ class DataMgrTest : public testing::Test { virtual void resetDataMgr(size_t num_slabs = 1) { boost::filesystem::remove_all(data_mgr_path_); system_params_.max_cpu_slab_size = slab_size_; + system_params_.default_cpu_slab_size = system_params_.max_cpu_slab_size; system_params_.min_cpu_slab_size = slab_size_; system_params_.cpu_buffer_mem_bytes = slab_size_ * num_slabs; #ifdef ENABLE_MEMKIND diff --git a/Tests/DataMgrTestHelpers.h b/Tests/DataMgrTestHelpers.h index d0ebb6c60a..74e1739985 100644 --- a/Tests/DataMgrTestHelpers.h +++ b/Tests/DataMgrTestHelpers.h @@ -16,7 +16,11 @@ #pragma once +#include "Catalog/SysCatalog.h" +#include "CudaMgr/CudaMgr.h" #include "DataMgr/AbstractBuffer.h" +#include "DataMgr/DataMgr.h" +#include "Shared/SystemParameters.h" namespace TestHelpers { @@ -124,4 +128,11 @@ class TestBuffer : public Data_Namespace::AbstractBuffer { size_t reserved_size_{0}; }; +void init_sys_catalog() { + SystemParameters sys_params; + auto data_mgr = std::make_shared( + BASE_PATH "/" + shared::kDataDirectoryName, sys_params, nullptr, false); + auto& sys_catalog = Catalog_Namespace::SysCatalog::instance(); + sys_catalog.init(BASE_PATH, data_mgr, {}, {}, false, false, {}); +} } // namespace TestHelpers diff --git a/Tests/DumpRestoreTest.cpp b/Tests/DumpRestoreTest.cpp index 4d255c5c2d..bf110f881d 100644 --- a/Tests/DumpRestoreTest.cpp +++ b/Tests/DumpRestoreTest.cpp @@ -792,6 +792,7 @@ int main(int argc, char** argv) { logger::LogOptions 
log_options(argv[0]); log_options.max_files_ = 0; // stderr only by default + log_options.set_base_path(BASE_PATH); desc.add(log_options.get_options()); po::variables_map vm; diff --git a/Tests/ExecuteTest.cpp b/Tests/ExecuteTest.cpp index a011277eed..b294093137 100644 --- a/Tests/ExecuteTest.cpp +++ b/Tests/ExecuteTest.cpp @@ -103,6 +103,83 @@ size_t choose_shard_count() { return g_num_leafs * (device_count > 1 ? device_count : 1); } +enum class ColumnDefinitionEnum : unsigned { + NAME = 0, // column name + HEAVY, // column definition as used in heavydb CREATE TABLE + SQLITE, // column definition as used in SQLite CREATE TABLE + N +}; + +// Indexed by ColumnDefinitionEnum +using ColumnDefinition = std::array; + +struct TableDefinition { + std::vector column_definitions; + // Add metadata as needed + + // Return column definitions for e = HEAVY or SQLITE. + std::string columnDefinitions(ColumnDefinitionEnum const e) const { + std::ostringstream oss; + for (size_t i = 0; i < column_definitions.size(); ++i) { + oss << (i ? 
", " : "") + << column_definitions[i][unsigned(ColumnDefinitionEnum::NAME)] << ' ' + << column_definitions[i][unsigned(e)]; + } + return oss.str(); + } +}; + +// clang-format off +// table name -> TableDefinition +std::map const g_table_definitions = +{ + { "test", TableDefinition + { // NAME, HEAVY type, SQLITE type + { { "x", "int not null", "int not null" } + , { "w", "tinyint", "tinyint" } + , { "y", "int", "int" } + , { "z", "smallint", "smallint" } + , { "t", "bigint", "bigint" } + , { "b", "boolean", "boolean" } + , { "f", "float", "float" } + , { "ff", "float", "float" } + , { "fn", "float", "float" } + , { "d", "double", "double" } + , { "dn", "double", "double" } + , { "str", "varchar(10)", "varchar(10)" } + , { "null_str", "text encoding dict", "text" } + , { "fixed_str", "text encoding dict(16)", "text" } + , { "fixed_null_str", "text encoding dict(16)", "text" } + , { "real_str", "text encoding none", "text" } + , { "shared_dict", "text", "text" } + , { "m", "timestamp(0)", "timestamp(0)" } + , { "me", "timestamp(0) encoding fixed(32)", "timestamp(0)" } + , { "m_3", "timestamp(3)", "timestamp(3)" } + , { "m_6", "timestamp(6)", "timestamp(6)" } + , { "m_9", "timestamp(9)", "timestamp(9)" } + , { "n", "time(0)", "time(0)" } + , { "ne", "time encoding fixed(32)", "time(0)" } + , { "o", "date", "date" } + , { "o1", "date encoding fixed(16)", "date" } + , { "o2", "date encoding fixed(32)", "date" } + , { "fx", "int encoding fixed(16)", "int" } + , { "dd", "decimal(10, 2)", "decimal(10, 2)" } + , { "dd_notnull", "decimal(10, 2) not null", "decimal(10, 2) not null" } + , { "ss", "text encoding dict", "text" } + , { "u", "int", "int" } + , { "ofd", "int", "int" } + , { "ufd", "int not null", "int not null" } + , { "ofq", "bigint", "bigint" } + , { "ufq", "bigint not null", "bigint not null" } + , { "smallint_nulls", "smallint", "smallint" } + , { "bn", "boolean not null", "boolean not null" } + , { "num_text", "text encoding dict", "text" } + } + } + } +}; +// 
clang-format on + std::shared_ptr run_multiple_agg(const string& query_str, const ExecutorDeviceType device_type, const bool allow_loop_joins) { @@ -2117,6 +2194,15 @@ TEST_F(Select, InValues) { } } +TEST_F(Select, InValuesDictEncodedStringColFromSubquery) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + c("select count(1) from (select R.fixed_str FROM test R JOIN test_inner S ON R.x = " + "S.x GROUP BY 1) T1 where T1.fixed_str NOT IN (select S.str FROM test R JOIN " + "test_inner S ON R.x = S.x WHERE R.fixed_str != 'FOO');", + dt); + } +} + TEST_F(Select, FilterAndMultipleAggregation) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); @@ -3932,16 +4018,73 @@ INSTANTIATE_TEST_SUITE_P( testing::Values(kAVG, kSUM, kAPPROX_QUANTILE, kSAMPLE, kSINGLE_VALUE, kMODE)), AggDistinctUnsupported::testName); -// Additional unit tests for APPROX_MEDIAN are in Quantile/. -TEST_F(Select, ApproxMedianSanity) { - auto dt = ExecutorDeviceType::CPU; - auto approx_median = [dt](std::string const col) { - std::string const query = "SELECT APPROX_MEDIAN(" + col + ") FROM test;"; - return v(run_simple_agg(query, dt)); - }; +struct TestParam { + double expected; + char const* column; +}; + +using ApproxMedianParam = std::tuple; + +class ApproxMedian : public Select, + public testing::WithParamInterface { + public: + static void executeQueryAndAssertResult(ExecutorDeviceType const dt, + TestParam const params) { + auto query = "SELECT APPROX_MEDIAN(" + std::string(params.column) + ") FROM test;"; + double actual = v(run_simple_agg(query, dt)); + EXPECT_EQ(params.expected, actual); + } + + // Replace non-alphanumeric chars w/ underscore. Don't repeat underscores. 
+ static std::string escape(std::string_view const sv) { + std::string escaped; + escaped.reserve(sv.size()); + bool last_was_underscore = false; + for (char const c : sv) { + if (std::isalnum(c)) { + escaped.push_back(c); + last_was_underscore = false; + } else { + if (!last_was_underscore) { + escaped.push_back('_'); + } + last_was_underscore = true; + } + } + if (last_was_underscore) { + escaped.pop_back(); + } + return escaped; + } + + // Return map of name -> column type for given table. + static std::map getColType(std::string_view table) { + std::map col_type; + for (ColumnDefinition const cd : g_table_definitions.at(table).column_definitions) { + col_type.emplace(cd[unsigned(ColumnDefinitionEnum::NAME)], + cd[unsigned(ColumnDefinitionEnum::HEAVY)]); + } + return col_type; + } + + // NOTE: test names must be non-empty, unique, and may only contain ASCII alphanumeric + // characters. In particular, they should not contain underscores* + // https://google.github.io/googletest/faq.html#why-should-test-suite-names-and-test-names-not-contain-underscore + static std::string testName(testing::TestParamInfo const& info) { + static std::map const col_type = + getColType("test"); + std::ostringstream oss; + char const* column_name = std::get<1>(info.param).column; + oss << std::get<0>(info.param) << '_' << column_name << '_' + << escape(col_type.at(column_name)); + return oss.str(); + } +}; + +TEST_P(ApproxMedian, AllColumnTypes) { if (g_aggregator) { try { - approx_median("w"); + executeQueryAndAssertResult(std::get<0>(GetParam()), std::get<1>(GetParam())); EXPECT_TRUE(false) << "Exception expected for approx_median query."; } catch (std::runtime_error const& e) { EXPECT_EQ(std::string(e.what()), @@ -3952,27 +4095,35 @@ TEST_F(Select, ApproxMedianSanity) { EXPECT_TRUE(false) << "std::runtime_error expected for approx_median query."; } } else { - EXPECT_EQ(-7.5, approx_median("w")); - EXPECT_EQ(7.0, approx_median("x")); - EXPECT_EQ(42.5, approx_median("y")); - 
EXPECT_EQ(101.0, approx_median("z")); - EXPECT_EQ(1001.5, approx_median("t")); - EXPECT_EQ((double(1.1f) + double(1.2f)) / 2, approx_median("f")); - EXPECT_EQ((double(1.1f) + double(101.2f)) / 2, approx_median("ff")); - EXPECT_EQ((double(-101.2f) + double(-1000.3f)) / 2, approx_median("fn")); - EXPECT_EQ(2.3, approx_median("d")); - EXPECT_EQ(-1111.5, approx_median("dn")); - EXPECT_EQ((11110.0 / 100 + 22220.0 / 100) / 2, approx_median("dd")); - EXPECT_EQ((11110.0 / 100 + 22220.0 / 100) / 2, approx_median("dd_notnull")); - EXPECT_EQ(NULL_DOUBLE, approx_median("u")); - EXPECT_EQ(2147483647.0, approx_median("ofd")); - EXPECT_EQ(-2147483647.5, approx_median("ufd")); - EXPECT_EQ(4611686018427387904.0, approx_median("ofq")); - EXPECT_EQ(-4611686018427387904.5, approx_median("ufq")); - EXPECT_EQ(32767.0, approx_median("smallint_nulls")); + executeQueryAndAssertResult(std::get<0>(GetParam()), std::get<1>(GetParam())); } } +INSTANTIATE_TEST_SUITE_P( + Select, // Just a name - not the class. This can be any symbol. 
+ ApproxMedian, + testing::Combine(testing::Values(ExecutorDeviceType::CPU, ExecutorDeviceType::GPU), + testing::Values(TestParam{-7.5, "w"}, + TestParam{7.0, "x"}, + TestParam{42.5, "y"}, + TestParam{101.0, "z"}, + TestParam{1001.5, "t"}, + TestParam{(double(1.1f) + double(1.2f)) / 2, "f"}, + TestParam{(double(1.1f) + double(101.2f)) / 2, "ff"}, + TestParam{(double(-101.2f) + double(-1000.3f)) / 2, + "fn"}, + TestParam{2.3, "d"}, + TestParam{-1111.5, "dn"}, + TestParam{(111.1 + 222.2) / 2, "dd"}, + TestParam{(111.1 + 222.2) / 2, "dd_notnull"}, + TestParam{NULL_DOUBLE, "u"}, + TestParam{2147483647.0, "ofd"}, + TestParam{-2147483647.5, "ufd"}, + TestParam{4611686018427387904.0, "ofq"}, + TestParam{-4611686018427387904.5, "ufq"}, + TestParam{32767.0, "smallint_nulls"})), + ApproxMedian::testName); + TEST_F(Select, ApproxMedianLargeInts) { if (g_aggregator) { LOG(WARNING) << "Skipping ApproxMedianLargeInts tests in distributed mode."; @@ -4289,6 +4440,42 @@ TEST_F(Select, ModeOrderBy) { } } +TEST_F(Select, TypeCAggregates) { + SKIP_ALL_ON_AGGREGATOR(); // APPROX_MEDIAN() is not supported in distributed mode. 
+ for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + // GROUP BY + ORDER BY aggregate + c("SELECT w, APPROX_MEDIAN(x), MODE(y) FROM test GROUP BY w" + " ORDER BY COUNT(DISTINCT z);", + "SELECT * FROM (VALUES (-8, 7.0, 42), (-7, 7.5, 43));", + dt); + // Non-group-by aggregate + c("SELECT APPROX_MEDIAN(x), COUNT(DISTINCT y), MODE(z) FROM test;", + "SELECT * FROM (VALUES (7.0, 2, 101));", + dt); + // Non-group-by aggregate w/ JOIN ON + c("SELECT APPROX_MEDIAN(test.x), MODE(test.y), COUNT(DISTINCT test.z)" + " FROM test JOIN test_inner ON test.y=test_inner.y;", + "SELECT * FROM (VALUES (7.5, 43, 2));", + dt); + // Non-group-by aggregate w/ CROSS JOIN filter + c("SELECT APPROX_MEDIAN(test.x), MODE(test.y), COUNT(DISTINCT test.z)" + " FROM test, test_inner WHERE test.y=test_inner.y;", + "SELECT * FROM (VALUES (7.5, 43, 2));", + dt); + // GROUP BY w/ JOIN ON + c("SELECT APPROX_MEDIAN(test.w), test.x, MODE(test.y), COUNT(DISTINCT test.z) FROM" + " test JOIN test_inner ON test.y=test_inner.y GROUP BY test.x ORDER BY test.x;", + "SELECT * FROM (VALUES (-7.0, 7, 43, 1), (-7.0, 8, 43, 1));", + dt); + // GROUP BY w/ CROSS JOIN filter + c("SELECT APPROX_MEDIAN(test.w), test.x, MODE(test.y), COUNT(DISTINCT test.z) FROM" + " test, test_inner WHERE test.y=test_inner.y GROUP BY test.x ORDER BY test.x;", + "SELECT * FROM (VALUES (-7.0, 7, 43, 1), (-7.0, 8, 43, 1));", + dt); + } +} + TEST_F(Select, ScanNoAggregation) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); @@ -5225,6 +5412,14 @@ TEST_F(Select, Strings) { v(run_simple_agg( "SELECT SUM(TRY_CAST(num_text AS INT)) FROM test;", dt))); + for (std::string col_name : {"str", "fixed_str", "real_str"}) { + std::string common_part = " FROM test ORDER BY 1 ASC NULLS FIRST"; + std::ostringstream oss1, oss2; + oss1 << "SELECT TRY_CAST(" << col_name << " AS TEXT)" << common_part; + oss2 << "SELECT " << col_name << common_part; + c(oss1.str(), oss2.str(), dt); + } + 
ASSERT_EQ(static_cast(g_num_rows), v(run_simple_agg( "SELECT COUNT(*) FROM test WHERE POSITION('foo' IN str) > 0;", dt))); @@ -5243,6 +5438,11 @@ TEST_F(Select, Strings) { v(run_simple_agg( "SELECT COUNT(*) FROM test WHERE str || ' ' || real_str = 'foo real_foo';", dt)))); + + EXPECT_EQ(v(run_simple_agg("SELECT COUNT(*) FROM (SELECT ENCODE_TEXT(NULL) " + "v FROM test) WHERE v IS NULL;", + dt)), + v(run_simple_agg("SELECT COUNT(*) FROM test", dt))); } } @@ -5286,6 +5486,32 @@ TEST_F(Select, NotILikeNoneEncodedTextWithParenthesis) { } } +TEST_F(Select, CanReuseCompiledCodeForStringPatternMatchQuery) { + SKIP_ALL_ON_AGGREGATOR(); + CodeCacheMetric q1_cache_metric, q2_cache_metric; + auto qe_instance = QueryEngine::getInstance(); + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + ASSERT_EQ(10, + v(run_simple_agg( + "SELECT COUNT(1) FROM TEST WHERE shared_dict LIKE '%foo';", dt))); + if (dt == ExecutorDeviceType::CPU) { + q1_cache_metric = qe_instance->cpu_code_accessor->getCodeCacheMetric(); + } else { + q1_cache_metric = qe_instance->gpu_code_accessor->getCodeCacheMetric(); + } + ASSERT_EQ(5, + v(run_simple_agg( + "SELECT COUNT(1) FROM TEST WHERE shared_dict LIKE '%baz';", dt))); + if (dt == ExecutorDeviceType::CPU) { + q2_cache_metric = qe_instance->cpu_code_accessor->getCodeCacheMetric(); + } else { + q2_cache_metric = qe_instance->gpu_code_accessor->getCodeCacheMetric(); + } + ASSERT_GT(q2_cache_metric.found_count, q1_cache_metric.found_count); + } +} + TEST_F(Select, SharedDictionary) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); @@ -5521,16 +5747,13 @@ TEST_F(Select, DictionaryStringEquality) { // execute between two text columns even when they do not share // dictionaries, with watchdog both on and off and without punting // to CPU - const auto watchdog_state = g_enable_watchdog; - const auto cpu_retry_state = g_allow_cpu_retry; - const auto cpu_step_retry_state = 
g_allow_query_step_cpu_retry; - - ScopeGuard reset_global_state = - [&watchdog_state, &cpu_retry_state, &cpu_step_retry_state] { - g_enable_watchdog = watchdog_state; - g_allow_cpu_retry = cpu_retry_state; - g_allow_query_step_cpu_retry = cpu_step_retry_state; - }; + ScopeGuard reset_global_state = [watchdog_state = g_enable_watchdog, + cpu_retry_state = g_allow_cpu_retry, + cpu_step_retry_state = g_allow_query_step_cpu_retry] { + g_enable_watchdog = watchdog_state; + g_allow_cpu_retry = cpu_retry_state; + g_allow_query_step_cpu_retry = cpu_step_retry_state; + }; g_allow_cpu_retry = false; g_allow_query_step_cpu_retry = false; @@ -5551,6 +5774,30 @@ TEST_F(Select, DictionaryStringEquality) { } } +TEST_F(Select, DictionaryStringNonEquality) { + ScopeGuard reset_global_state = [watchdog_state = g_enable_watchdog, + cpu_retry_state = g_allow_cpu_retry, + cpu_step_retry_state = g_allow_query_step_cpu_retry] { + g_enable_watchdog = watchdog_state; + g_allow_cpu_retry = cpu_retry_state; + g_allow_query_step_cpu_retry = cpu_step_retry_state; + }; + + g_allow_cpu_retry = false; + g_allow_query_step_cpu_retry = true; + + for (auto enable_watchdog : {true, false}) { + g_enable_watchdog = enable_watchdog; + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + THROW_ON_AGGREGATOR(c("SELECT COUNT(*) FROM test WHERE null_str > real_str", dt)); + THROW_ON_AGGREGATOR(c("SELECT COUNT(*) FROM test WHERE null_str >= real_str", dt)); + THROW_ON_AGGREGATOR(c("SELECT COUNT(*) FROM test WHERE null_str < real_str", dt)); + THROW_ON_AGGREGATOR(c("SELECT COUNT(*) FROM test WHERE null_str <= real_str", dt)); + } + } +} + void prepare_inserts_test_table() { // Test using a string of length 100,000 chars. 
// Each number is represented as 1 space ' ' + 9 zero-padded integers @@ -22113,6 +22360,17 @@ TEST_F(Select, WindowFunctionEmptyPartitions) { " GROUP BY DATE_TRUNC(DAY, d), x ORDER BY DATE_TRUNC(DAY, d) NULLS FIRST;"; EXPECT_NO_THROW(run_multiple_agg(query, dt)); } + + { + std::string query = + "SELECT DATE_TRUNC(DAY, d) AS binned_day, COUNT(*) AS n, SUM(x) AS sum_x, " + "COUNT(*) - LAG(COUNT(*)) OVER ( ORDER BY DATE_TRUNC(DAY, d) ) AS " + "lag_n_order_by_d, SUM(x) / SUM(SUM(x+1)) OVER ( ORDER BY DATE_TRUNC(DAY, d)) " + "AS sum_over_lag_sum_x FROM " + + table_name + + " GROUP BY DATE_TRUNC(DAY, d) ORDER BY DATE_TRUNC(DAY, d) NULLS FIRST;"; + EXPECT_NO_THROW(run_multiple_agg(query, dt)); + } } } @@ -22339,6 +22597,16 @@ TEST_F(Select, WindowFunctionLag) { } } } + + { + std::string q1 = "SELECT LAG(COUNT(*), 1) OVER (ORDER BY x ASC NULLS FIRST) FROM " + + table_name + " GROUP BY x ORDER BY x ASC NULLS FIRST;"; + c(q1, q1, dt); + std::string q2 = + "SELECT COUNT(*) - LAG(COUNT(*), 1) OVER (ORDER BY x ASC NULLS FIRST) FROM " + + table_name + " GROUP BY x ORDER BY x ASC NULLS FIRST;"; + c(q2, q2, dt); + } } } @@ -22443,6 +22711,16 @@ TEST_F(Select, WindowFunctionLead) { } } } + { + std::string q1 = + "SELECT LEAD(COUNT(*), 1) OVER (ORDER BY x ASC NULLS FIRST) FROM " + + table_name + " GROUP BY x ORDER BY x ASC NULLS FIRST;"; + c(q1, q1, dt); + std::string q2 = + "SELECT COUNT(*) - LEAD(COUNT(*), 1) OVER (ORDER BY x ASC NULLS FIRST) FROM " + + table_name + " GROUP BY x ORDER BY x ASC NULLS FIRST;"; + c(q2, q2, dt); + } } } @@ -22827,6 +23105,13 @@ TEST_F(Select, WindowFunctionSum) { table_name + ")) ORDER BY total ASC NULLS FIRST"; c(query, query, dt); } + { + std::string query = + "SELECT x, COUNT(*) - SUM(COUNT(*)) OVER (ORDER BY COUNT(*) DESC, x NULLS " + "FIRST) FROM " + + table_name + " GROUP BY x ORDER BY COUNT(*) DESC, x NULLS FIRST;"; + c(query, query, dt); + } } } @@ -23019,6 +23304,105 @@ TEST_F(Select, WindowFunctionNested) { dt); } +class 
WindowFunctionLiteralArg : public ::testing::Test { + public: + struct Test { + SqlWindowFunctionKind window_func_kind{SqlWindowFunctionKind::UNKNOWN}; + size_t num_args{0}; + bool throw_exception{false}; + std::optional expected_res{std::nullopt}; + }; + using Param = std::tuple; + static std::string testName(testing::TestParamInfo const& info) { + std::ostringstream oss; + auto const window_function_kind = std::get<1>(info.param).window_func_kind; + CHECK_NE(window_function_kind, SqlWindowFunctionKind::UNKNOWN); + oss << std::get<0>(info.param) << '_' << window_function_kind; + return oss.str(); + } +}; + +std::ostream& operator<<(std::ostream& os, WindowFunctionLiteralArg::Test const& test) { + std::string res_str = + test.expected_res.has_value() ? std::to_string(*test.expected_res) : "N/A"; + return os << test.window_func_kind << ", " << test.num_args << ", " + << std::to_string(test.throw_exception) << ", " << res_str; +} + +class WindowFunctionLiteralArgTest + : public WindowFunctionLiteralArg, + public testing::WithParamInterface { + public: + static void performTest(ExecutorDeviceType const dt, Test const& test) { + std::ostringstream oss; + oss << "SELECT " << test.window_func_kind << "("; + if (test.num_args > 0) { + std::vector args_vec(test.num_args, "2"); + oss << boost::join(args_vec, ","); + } + oss << ") OVER () FROM test ORDER BY 1 ASC LIMIT 1"; + auto const query = oss.str(); + if (test.throw_exception) { + EXPECT_ANY_THROW(run_multiple_agg(query, dt)); + } else { + CHECK(test.expected_res.has_value()); + if (test.window_func_kind == SqlWindowFunctionKind::AVG) { + EXPECT_EQ(static_cast(*test.expected_res), + v(run_simple_agg(query, dt))) + << query; + } else { + EXPECT_EQ(*test.expected_res, v(run_simple_agg(query, dt))) << query; + } + } + } +}; + +TEST_P(WindowFunctionLiteralArgTest, Test) { + SKIP_ALL_ON_AGGREGATOR(); + auto const [dt, test_args] = GetParam(); + WindowFunctionLiteralArgTest::performTest(dt, test_args); +} + 
+INSTANTIATE_TEST_SUITE_P( + Select, + WindowFunctionLiteralArgTest, + testing::Combine( + testing::Values(ExecutorDeviceType::CPU, ExecutorDeviceType::GPU), + testing::Values( + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::ROW_NUMBER, + 0, + false, + 1}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::MIN, 1, false, 2}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::MAX, 1, false, 2}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::AVG, 1, false, 2}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::SUM, 1, false, 40}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::COUNT_IF, 1, false, 20}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::COUNT, 1, false, 20}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::NTILE, 1, false, 1}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::LAG, 1, false, 2}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::LEAD, 1, false, 2}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::FIRST_VALUE, + 1, + false, + 2}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::LAST_VALUE, + 1, + false, + 2}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::FORWARD_FILL, 1, true}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::BACKWARD_FILL, 1, true}, + WindowFunctionLiteralArg::Test{ + SqlWindowFunctionKind::CONDITIONAL_CHANGE_EVENT, + 1, + true}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::NTH_VALUE, 2, false, 2}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::LAG_IN_FRAME, 2, true}, + WindowFunctionLiteralArg::Test{SqlWindowFunctionKind::LEAD_IN_FRAME, + 2, + true})), + WindowFunctionLiteralArg::testName); + TEST_F(Select, WindowFunctionFraming) { const ExecutorDeviceType dt = ExecutorDeviceType::CPU; // to make a stable test result, we use a table having non-peer row @@ -25618,6 +26002,43 @@ TEST_F(Select, ResultsetAndChunkMetadataRecycling) { clearCache(); } +TEST_F(Select, ChunkMetadataCacheFromSubQuery) { + 
SKIP_ALL_ON_AGGREGATOR(); + SKIP_WITH_TEMP_TABLES(); + + ScopeGuard reset_global_flag_state = [orig_data_recycler = g_enable_data_recycler, + orig_chunk_metadata_recycler = + g_use_chunk_metadata_cache] { + g_enable_data_recycler = orig_data_recycler; + g_use_chunk_metadata_cache = orig_chunk_metadata_recycler; + }; + g_enable_data_recycler = true; + g_use_chunk_metadata_cache = true; + + auto executor = Executor::getExecutor(Executor::UNITARY_EXECUTOR_ID).get(); + auto& recycler_holder = executor->getResultSetRecyclerHolder(); + auto chunk_metadata_recycler = recycler_holder.getChunkMetadataRecycler(); + CHECK(chunk_metadata_recycler); + + auto clearCache = [&executor] { + executor->clearMemory(MemoryLevel::CPU_LEVEL); + executor->getQueryPlanDagCache().clearQueryPlanCache(); + }; + clearCache(); + + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + // check if we can recycle chunk metadata of the subquery + c("select R.x, R.y, S.x, S.y, sum(R.xx) from test_inner R, test S where R.x in " + "(select x from test group by x having sum(w) > 0) group by 1, 2, 3, 4", + dt); + EXPECT_GT(chunk_metadata_recycler->getCurrentNumCachedItems( + CacheItemType::CHUNK_METADATA, DataRecyclerUtil::CPU_DEVICE_IDENTIFIER), + static_cast(0)); + clearCache(); + } +} + TEST_F(Select, QueryStepSkipping) { SKIP_ALL_ON_AGGREGATOR(); SKIP_WITH_TEMP_TABLES(); @@ -26104,6 +26525,133 @@ TEST_F(Select, RemoveFromQuerySessionList) { } } +TEST_F(Select, OstensibleTautologyPredicate) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + + const auto result = + run_multiple_agg("SELECT COUNT_IF(ofd IS NULL), COUNT(*) FROM test;", dt); + ASSERT_EQ(result->rowCount(), size_t(1)); + + const auto row = result->getNextRow(false, false); + ASSERT_EQ(row.size(), size_t(2)); + + const auto null_count = v(row[0]); + EXPECT_GT(null_count, 0); + + const auto total_row_count = v(row[1]); + EXPECT_GT(total_row_count, 0); + + const auto 
row_count = + v(run_simple_agg("SELECT COUNT(*) FROM test WHERE ofd = ofd;", dt)); + EXPECT_EQ(row_count, total_row_count - null_count); + } +} + +TEST_F(Select, ComplexQueryWithEmptyStringLiteral) { + // this test is to check the regex matching functionality of + // our find_string_literals function which previously had an issue + // of throwing infinite exception while handling the following query pattern + SKIP_ALL_ON_AGGREGATOR(); + std::string tbl1_ddl{ + "CREATE TABLE aaa_aaaa_aa_aaaaaaaaa_aa_a2 (\n" + "aaa_aa DATE ENCODING DAYS(32),\n" + "bbbbbb_bb DATE ENCODING DAYS(32),\n" + "cccc_ccc DATE ENCODING DAYS(32),\n" + "ddd_ddd TEXT ENCODING DICT(32),\n" + "eee_eee TEXT ENCODING DICT(32),\n" + "fffff TEXT ENCODING DICT(32),\n" + "gggg TEXT ENCODING DICT(32),\n" + "gggg_ggggg TEXT ENCODING DICT(32),\n" + "hhhhhhhh TEXT ENCODING DICT(32),\n" + "hhhhhhhh_hhhhhhh TEXT ENCODING DICT(32),\n" + "iiiiiiiiiiii TEXT ENCODING DICT(32),\n" + "iiiiiiiiiiii_iiiiiii TEXT ENCODING DICT(32),\n" + "jj TEXT ENCODING DICT(32),\n" + "kkkkkkk_kkkkk_kkkk_kkkkk TEXT ENCODING DICT(32),\n" + "ll_lll_llllllll TEXT ENCODING DICT(32),\n" + "mmmmmmm_mmm TEXT ENCODING DICT(32),\n" + "ooo_ooo_oooooo TEXT ENCODING DICT(32),\n" + "pppp_ppppppp_pppppppppp_ppp TEXT ENCODING DICT(32),\n" + "qqqqqqqqqq_qqqqqq_qqq TEXT ENCODING DICT(32),\n" + "rrrrrr_rrr_rrr INTEGER,\n" + "sssssss TEXT ENCODING DICT(32),\n" + "ttttttt_ttttttt_tttt TEXT ENCODING DICT(32),\n" + "vvvvvv_vvvvv_vvvvvv TEXT ENCODING DICT(32),\n" + "xxxxx_xxxxx INTEGER,\n" + "yyy_yyyyy BIGINT,\n" + "zzzz_zzzzz BIGINT,\n" + "ababa_bab DOUBLE,\n" + "cdcdc_dcd BIGINT);"}; + std::string tbl2_ddl{ + "CREATE TABLE bbb_bb_bb_bbbbbbbbb (\n" + "ef_ef_efe TEXT ENCODING DICT(16),\n" + "ghghg_hghgh TEXT ENCODING DICT(8),\n" + "ikikik_ik_ikikiki TEXT ENCODING DICT(8),\n" + "ll_lll_llllllll TEXT ENCODING DICT(8),\n" + "qwqwqwq TEXT ENCODING DICT(8),\n" + "er_er TEXT ENCODING DICT(16));"}; + run_ddl_statement("DROP TABLE IF EXISTS 
aaa_aaaa_aa_aaaaaaaaa_aa_a2;"); + run_ddl_statement("DROP TABLE IF EXISTS bbb_bb_bb_bbbbbbbbb;"); + run_ddl_statement(tbl1_ddl); + run_ddl_statement(tbl2_ddl); + std::string query( + "SELECT(CASE WHEN ((CASE WHEN (\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"jj\" = " + "'AAAAAAAAAAA BBBBBBBBBBBB CCC. - DDDDD 2 EEEEE') THEN 'FFF-EEEEE' WHEN " + "(\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"pppp_ppppppp_pppppppppp_ppp\" = 'QWEQWE') " + "THEN 'FEF CXZCXZC' WHEN ((CAST(\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"aaa_aa\" AS " + "DATE) >= CAST('2022-08-01' AS DATE)) AND " + "(\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"pppp_ppppppp_pppppppppp_ppp\" = 'FFF-EEEEE')) " + "THEN 'FFF-ZAXAXA' WHEN ((CAST(\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"aaa_aa\" AS " + "DATE) >= CAST('2022-08-01' AS DATE)) AND (\"bbb_bb_bb_bbbbbbbbb1\".\"er_er\" = " + "'CCC WWW. - FFFFF')) THEN 'J6W-EEEEE' ELSE " + "\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"pppp_ppppppp_pppppppppp_ppp\" END) <> '') THEN " + "'ZAXAXA' ELSE (CASE WHEN (\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"ddd_ddd\" = 'QWE') " + "THEN 'QWE' WHEN ((\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"ddd_ddd\" = 'QWERTTYU') AND " + "(\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"eee_eee\" = 'ASD')) THEN 'ASD' WHEN " + "((\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"ddd_ddd\" = 'QWERTTYU') AND " + "COALESCE((\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"eee_eee\" IN ('CC-GRTGTRGTRG', " + "'CC-YTRYTRYTRYT', 'CC-RYRYRYR YRYRYRY', 'BB-QWEQWEQWEQWER & REWREWREWR', 'ZZ-AAA " + "ACCACCAC & ACACACACACA', 'CA-CACACAC', 'CA-CACACACAC CACACACA')), FALSE)) THEN " + "'CC' WHEN ((\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"ddd_ddd\" = 'QWERTTYU') AND " + "(COALESCE((\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"eee_eee\" IN ('AMBULANT', 'IA')), " + "FALSE) OR (\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"eee_eee\" IS NULL))) THEN " + "'ABABABAB' ELSE 'IYUITG' END) END) AS \"FDFDFDFDFDF_5123970507839131648\", " + "SUM((CASE WHEN (\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"xxxxx_xxxxx\" = 1) THEN 0 ELSE " + "\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"ababa_bab\" END)) AS \"NBNB(NBNBNBN BNBNB " + 
"NBNBN BNBNBN (NBNBN) (copy)_61080138207346689)(480704438)(0)\", SUM((CASE WHEN " + "((CASE WHEN (\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"ddd_ddd\" IN ('EWQEWQ BVBVBVB', " + "'TGTGTGTG', 'NBNBN BNBNB')) THEN 'IYUITG' ELSE " + "\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"ddd_ddd\" END) = 'QWERTTYU') THEN " + "\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"ababa_bab\" ELSE 0 END)) AS " + "\"TEMP(Calculation_114771454018420739)(4174943029)(0)\", " + "MAX(\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"aaa_aa\") AS " + "\"ZAZA(HJHJHJHJH_2940991320156753920)(166123433)(0)\", " + "SUM(\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"ababa_bab\") AS " + "\"ZAZA(HJHJHJHJH_2940991320156753920)(3328816748)(0)\", " + "MIN(\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"aaa_aa\") AS " + "\"ZAZA(HJHJHJHJH_2940991320156753920)(784626528)(0)\", " + "\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"kkkkkkk_kkkkk_kkkk_kkkkk\" AS " + "\"kkkkkkk_kkkkk_kkkk_kkkkk\", \"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"iiiiiiiiiiii\" " + "AS \"iiiiiiiiiiii\", \"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"hhhhhhhh\" AS " + "\"hhhhhhhh\", DATE_TRUNC( MONTH, \"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"aaa_aa\" ) AS " + "\"tmn:aaa_aa:ok\" FROM \"aaa_aaaa_aa_aaaaaaaaa_aa_a2\" " + "\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\" LEFT JOIN \"bbb_bb_bb_bbbbbbbbb\" " + "\"bbb_bb_bb_bbbbbbbbb1\" ON (\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"jj\" = " + "\"bbb_bb_bb_bbbbbbbbb1\".\"ef_ef_efe\") WHERE " + "(((\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"kkkkkkk_kkkkk_kkkk_kkkkk\" IN ('CE VIS', " + "'MIN', 'W VIS')) AND (CASE WHEN ((\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"fffff\" IN " + "('PHW_BRO_PRE')) OR (\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"fffff\" IS NULL)) THEN " + "FALSE ELSE TRUE END)) AND ((DATE_TRUNC( MONTH, " + "\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"aaa_aa\" ) >= (TIMESTAMP '2022-01-01 " + "00:00:00.000')) AND (DATE_TRUNC( MONTH, " + "\"aaa_aaaa_aa_aaaaaaaaa_aa_a21\".\"aaa_aa\" ) <= (TIMESTAMP '2023-09-01 " + "00:00:00.000')))) GROUP BY 1,7,8,9,10 LIMIT 100000;"); + EXPECT_NO_THROW(run_multiple_agg(query, ExecutorDeviceType::CPU)); + 
run_ddl_statement("DROP TABLE IF EXISTS aaa_aaaa_aa_aaaaaaaaa_aa_a2;"); + run_ddl_statement("DROP TABLE IF EXISTS bbb_bb_bb_bbbbbbbbb;"); +} + class DateAndTimeFunctionsTest : public QRExecutorDeviceParamTest {}; TEST_P(DateAndTimeFunctionsTest, CastLiteralToDate) { @@ -28117,25 +28665,9 @@ int create_and_populate_tables(const bool use_temporary_tables, const std::string drop_old_test{"DROP TABLE IF EXISTS test;"}; run_ddl_statement(drop_old_test); g_sqlite_comparator.query(drop_old_test); - std::string columns_definition{ - "x int not null, w tinyint, y int, z smallint, t bigint, b boolean, f float, " - "ff " - "float, fn " - "float, d double, dn double, str " - "varchar(10), null_str text encoding dict, fixed_str text encoding dict(16), " - "fixed_null_str text encoding " - "dict(16), real_str text encoding none, shared_dict text, m timestamp(0), me " - "timestamp(0) encoding fixed(32), m_3 " - "timestamp(3), m_6 timestamp(6), " - "m_9 timestamp(9), n time(0), ne time encoding fixed(32), o date, o1 date " - "encoding fixed(16), o2 date " - "encoding fixed(32), fx int " - "encoding fixed(16), dd decimal(10, 2), dd_notnull decimal(10, 2) not null, ss " - "text encoding dict, u int, ofd " - "int, ufd int not null, ofq bigint, ufq bigint not null, smallint_nulls " - "smallint, bn boolean not null, num_text text encoding dict"}; + TableDefinition const& test_definition = g_table_definitions.at("test"); const std::string create_test = build_create_table_statement( - columns_definition, + test_definition.columnDefinitions(ColumnDefinitionEnum::HEAVY), "test", {g_shard_count ? 
"str" : "", g_shard_count}, {{"str", "test_inner", "str"}, {"shared_dict", "test", "str"}}, @@ -28144,20 +28676,9 @@ int create_and_populate_tables(const bool use_temporary_tables, with_delete_support); run_ddl_statement(create_test); g_sqlite_comparator.query( - "CREATE TABLE test(x int not null, w tinyint, y int, z smallint, t bigint, b " - "boolean, f " - "float, ff float, fn float, d " - "double, dn double, str varchar(10), null_str text, fixed_str text, " - "fixed_null_str text, real_str text, " - "shared_dict " - "text, m timestamp(0), me timestamp(0), m_3 timestamp(3), m_6 timestamp(6), " - "m_9 " - "timestamp(9), n " - "time(0), ne time(0), o date, o1 date, o2 date, " - "fx int, dd decimal(10, 2), dd_notnull decimal(10, 2) not " - "null, ss " - "text, u int, ofd int, ufd int not null, ofq bigint, ufq bigint not null, " - "smallint_nulls smallint, bn boolean not null, num_text text);"); + "CREATE TABLE test(" + + test_definition.columnDefinitions(ColumnDefinitionEnum::SQLITE) + ");"); + } catch (...) 
{ LOG(ERROR) << "Failed to (re-)create table 'test'"; return -EEXIST; @@ -29194,6 +29715,7 @@ int main(int argc, char** argv) { logger::LogOptions log_options(argv[0]); log_options.severity_ = logger::Severity::FATAL; + log_options.set_base_path(BASE_PATH); log_options.set_options(); // update default values desc.add(log_options.get_options()); diff --git a/Tests/ForeignServerDdlTest.cpp b/Tests/ForeignServerDdlTest.cpp index 4f83e11b1c..e81f0aa629 100644 --- a/Tests/ForeignServerDdlTest.cpp +++ b/Tests/ForeignServerDdlTest.cpp @@ -184,6 +184,17 @@ TEST_F(CreateForeignServerTest, MissingWithClause) { queryAndAssertException(query, "Foreign server options must contain \"STORAGE_TYPE\"."); } +TEST_F(CreateForeignServerTest, CreateOrReplaceServer) { + std::string query{ + "CREATE OR REPLACE SERVER test_server FOREIGN DATA WRAPPER delimited_file " + "WITH (storage_type = 'LOCAL_FILE', base_path = '/test_path/');"}; + // using a partial exception for the sake of brevity + queryAndAssertPartialException(query, + R"(SQL Error: Encountered "SERVER" at line 1, column 19. 
+Was expecting: + "MODEL" ...)"); +} + class ReservedServerNamePrefixTest : public DBHandlerTestFixture, public ::testing::WithParamInterface {}; diff --git a/Tests/GeospatialJoinTest.cpp b/Tests/GeospatialJoinTest.cpp index 15789aebd9..10a31e2161 100644 --- a/Tests/GeospatialJoinTest.cpp +++ b/Tests/GeospatialJoinTest.cpp @@ -24,6 +24,7 @@ #include "QueryEngine/ArrowResultSet.h" #include "QueryEngine/Execute.h" +#include "QueryEngine/JoinHashTable/BoundingBoxIntersectJoinHashTable.h" #include "Shared/scope.h" #include "TestHelpers.h" @@ -66,8 +67,10 @@ struct ExecutionContext { }; template -void executeAllScenarios(TEST_BODY fn) { - for (const auto bbox_intersect_state : {true, false}) { +void executeAllScenarios(TEST_BODY fn, + std::initializer_list bbox_intersect_states = {true, + false}) { + for (const auto bbox_intersect_state : bbox_intersect_states) { const auto enable_bbox_intersect_hashjoin = g_enable_bbox_intersect_hashjoin; const auto enable_hashjoin_many_to_many_state = g_enable_hashjoin_many_to_many; @@ -299,6 +302,33 @@ TEST_F(GeospatialJoinTest, InnerJoinPolyInPointIntersects) { }); } +TEST_F(GeospatialJoinTest, InnerHashJoinPolyInPointIntersects) { + ASSERT_EQ(static_cast(3), + v(execSQL("SELECT COUNT(*) FROM does_intersect_a", + ExecutorDeviceType::CPU))); + ASSERT_EQ(static_cast(2), + v(execSQL("SELECT COUNT(*) FROM does_intersect_b", + ExecutorDeviceType::CPU))); + ScopeGuard reset = [orig = g_from_table_reordering] { g_from_table_reordering = orig; }; + g_from_table_reordering = true; + executeAllScenarios([](const ExecutionContext ctx) -> void { + for (std::string geometry : {"poly", "mpoly"}) { + for (bool param : {true, false}) { + std::ostringstream oss; + oss << "SELECT COUNT(1) FROM does_intersect_a R, does_intersect_b S WHERE " + "ST_INTERSECTS("; + if (param) { + oss << "R." << geometry << ", S.pt);"; + } else { + oss << "S.pt, R." 
<< geometry << ");"; + } + ASSERT_EQ(static_cast(3), + v(execSQL(oss.str(), ctx.device_type))); + } + } + }); +} + TEST_F(GeospatialJoinTest, InnerJoinPolyPolyIntersects) { executeAllScenarios([](const ExecutionContext ctx) -> void { auto sql = R"(SELECT count(*) from does_intersect_a as a @@ -502,6 +532,26 @@ TEST_F(GeospatialJoinTest, EmptyPolyPolyJoin) { }); } +TEST_F(GeospatialJoinTest, InnerJoinPolyPointContains) { + ASSERT_EQ(static_cast(3), + v(execSQL("SELECT COUNT(*) FROM does_intersect_a", + ExecutorDeviceType::CPU))); + ASSERT_EQ(static_cast(2), + v(execSQL("SELECT COUNT(*) FROM does_intersect_b", + ExecutorDeviceType::CPU))); + ScopeGuard reset = [orig = g_from_table_reordering] { g_from_table_reordering = orig; }; + g_from_table_reordering = true; + executeAllScenarios([](const ExecutionContext ctx) -> void { + for (std::string geometry : {"poly", "mpoly"}) { + auto q = + "SELECT count(*) from does_intersect_a a, does_intersect_b b WHERE " + "ST_Contains(a." + + geometry + ", b.pt);"; + ASSERT_EQ(static_cast(3), v(execSQL(q, ctx.device_type))); + } + }); +} + TEST_F(GeospatialJoinTest, SkipHashtableCaching) { const auto enable_bbox_intersect_hashjoin_state = g_enable_bbox_intersect_hashjoin; const auto enable_hashjoin_many_to_many_state = g_enable_hashjoin_many_to_many; @@ -1628,6 +1678,125 @@ TEST_F(MultiFragGeospatialJoinTest, Nullable_Geo_Exhaustive) { }); } +class MaxBBoxOverlapsExceededTest : public ::testing::TestWithParam { + protected: + static void SetUpTestSuite() { + createTable("polys_max_bbox_overlaps_1"); + createTable("polys_max_bbox_overlaps_2"); + createTable("polys_max_bbox_overlaps_3"); + + populateTables(); + } + + static void TearDownTestSuite() { + dropTable("polys_max_bbox_overlaps_1"); + dropTable("polys_max_bbox_overlaps_2"); + dropTable("polys_max_bbox_overlaps_3"); + } + + static void createTable(const std::string& table_name) { + QR::get()->runDDLStatement( + "create table " + table_name + + " (id int, poly geometry(polygon, 
4326)) with (fragment_size = 10);"); + } + + static void dropTable(const std::string& table_name) { + QR::get()->runDDLStatement("drop table if exists " + table_name + ";"); + } + + static void populateTables() { + for (const auto& table : {"polys_max_bbox_overlaps_1", + "polys_max_bbox_overlaps_2", + "polys_max_bbox_overlaps_3"}) { + std::string insert_stmt{"insert into " + std::string{table} + " values "}; + for (size_t i = 0; i <= kMaxBBoxOverlapsCount; i++) { + if (i > 0) { + insert_stmt += ", "; + } + std::string polygon; + if (table == "polys_max_bbox_overlaps_1"s) { + // All left table polygons intersect with all right table polygons. + polygon = "POLYGON ((0 0,5 0,5 5,0 5,0 0))"; + } else if (table == "polys_max_bbox_overlaps_3"s) { + if (i < kMaxBBoxOverlapsCount / 2) { + polygon = "POLYGON ((0 0,1 0,1 1,0 1,0 0))"; + } else { + polygon = "POLYGON ((2 2,3 2,3 3,2 3,2 2))"; + } + } else { + polygon = "POLYGON ((0 0,3 0,3 3,0 3,0 0))"; + } + insert_stmt += "(" + std::to_string(i) + ", '" + polygon + "')"; + } + insert_stmt += ";"; + QR::get()->runSQL(insert_stmt, ExecutorDeviceType::CPU); + } + } + + void queryAndAssertError(const std::string& query, + ExecutorDeviceType device_type, + const std::string& error_message) { + try { + QR::get()->runSQL(query, device_type, true, false); + FAIL() << "An exception should have been thrown for this test case."; + } catch (const std::exception& e) { + EXPECT_EQ(error_message, std::string{e.what()}); + } + } + + std::string getHashedTable() { + std::string table_name; + if (GetParam()) { + table_name = "polys_max_bbox_overlaps_2"; + } else { + table_name = "polys_max_bbox_overlaps_3"; + } + return table_name; + } + + static constexpr char const* kMaxBBoxOverlapsError{ + "BBOX_OVERLAPS_LIMIT_EXCEEDED: Maximum supported number of bounding box " + "overlaps exceeded"}; +}; + +TEST_P(MaxBBoxOverlapsExceededTest, NonGroupByAggregate) { + executeAllScenarios( + [this](const ExecutionContext ctx) -> void { + auto sql = 
"SELECT count(*) FROM polys_max_bbox_overlaps_1 AS a JOIN " + + getHashedTable() + " AS b ON ST_Intersects(a.poly, b.poly);"; + queryAndAssertError(sql, ctx.device_type, kMaxBBoxOverlapsError); + }, + {true}); +} + +TEST_P(MaxBBoxOverlapsExceededTest, GroupByAggregate) { + executeAllScenarios( + [this](const ExecutionContext ctx) -> void { + auto sql = "SELECT count(*) FROM polys_max_bbox_overlaps_1 AS a JOIN " + + getHashedTable() + + " AS b ON ST_Intersects(a.poly, b.poly) GROUP BY a.id;"; + queryAndAssertError(sql, ctx.device_type, kMaxBBoxOverlapsError); + }, + {true}); +} + +TEST_P(MaxBBoxOverlapsExceededTest, NonAggregateProjection) { + executeAllScenarios( + [this](const ExecutionContext ctx) -> void { + auto sql = "SELECT * FROM polys_max_bbox_overlaps_1 AS a JOIN " + + getHashedTable() + " AS b ON ST_Intersects(a.poly, b.poly) LIMIT 1;"; + queryAndAssertError(sql, ctx.device_type, kMaxBBoxOverlapsError); + }, + {true}); +} + +INSTANTIATE_TEST_SUITE_P(SingleAndMultipleBins, + MaxBBoxOverlapsExceededTest, + ::testing::Bool(), + [](const auto& info) { + return info.param ? 
"SingleBin" : "MultipleBins"; + }); + class ParallelLinearization : public ::testing::Test { protected: void SetUp() override { g_enable_parallel_linearization = 10; } diff --git a/Tests/GeospatialTest.cpp b/Tests/GeospatialTest.cpp index dd0d5c61e8..dfc2239389 100644 --- a/Tests/GeospatialTest.cpp +++ b/Tests/GeospatialTest.cpp @@ -254,12 +254,11 @@ void import_geospatial_null_test(const bool use_temporary_tables) { const std::string geospatial_null_test("DROP TABLE IF EXISTS geospatial_null_test;"); run_ddl_statement(geospatial_null_test); const auto create_ddl = build_create_table_statement( - "id INT, p POINT, mp MULTIPOINT, l LINESTRING, ml MULTILINESTRING, " - "poly POLYGON, mpoly MULTIPOLYGON, gpnotnull GEOMETRY(POINT) NOT NULL, " - "gp4326 GEOMETRY(POINT,4326) ENCODING COMPRESSED(32), " - "gp4326none GEOMETRY(POINT,4326) ENCODING NONE, " - "gp900913 GEOMETRY(POINT,900913), gmp4326 GEOMETRY(MULTIPOINT,4326), " - "gl4326none GEOMETRY(LINESTRING,4326) ENCODING NONE, " + "id INT, x DOUBLE, y DOUBLE, p POINT, mp MULTIPOINT, l LINESTRING, ml " + "MULTILINESTRING, poly POLYGON, mpoly MULTIPOLYGON, gpnotnull GEOMETRY(POINT) NOT " + "NULL, gp4326 GEOMETRY(POINT,4326) ENCODING COMPRESSED(32), gp4326none " + "GEOMETRY(POINT,4326) ENCODING NONE, gp900913 GEOMETRY(POINT,900913), gmp4326 " + "GEOMETRY(MULTIPOINT,4326), gl4326none GEOMETRY(LINESTRING,4326) ENCODING NONE, " "gml4326 GEOMETRY(MULTILINESTRING,4326), gpoly4326 GEOMETRY(POLYGON,4326)", "geospatial_null_test", {"", 0}, @@ -271,6 +270,8 @@ void import_geospatial_null_test(const bool use_temporary_tables) { run_ddl_statement(create_ddl); TestHelpers::ValuesGenerator gen("geospatial_null_test"); for (size_t i = 0; i < g_num_rows; ++i) { + const std::string x = std::to_string(10 * i); + const std::string y = std::to_string(i); const std::string point{"'POINT(" + std::to_string(i) + " " + std::to_string(i) + ")'"}; const std::string multipoint{ @@ -296,6 +297,8 @@ void import_geospatial_null_test(const bool 
use_temporary_tables) { const std::string mpoly{"'MULTIPOLYGON(((0 0, " + std::to_string(i + 1) + " 0, 0 " + std::to_string(i + 1) + ", 0 0)))'"}; run_multiple_agg(gen(i, + (i == 8) ? "NULL" : x, + (i == 9) ? "NULL" : y, (i % 2 == 0) ? "NULL" : point, (i % 2 == 0) ? "NULL" : multipoint, (i == 1) ? "NULL" : linestring, @@ -1080,10 +1083,7 @@ TEST_P(GeoSpatialTestTablesFixture, Basics) { v(run_simple_agg("SELECT ST_NPoints(mpoly) FROM geospatial_test " "ORDER BY ST_NPoints(l) DESC LIMIT 1;", dt))); - // null - // for a POINT, this still returns 1 even if the point value is NULL - // @TODO check the required behavior here and fix separately if required - ASSERT_EQ(static_cast(1), + ASSERT_EQ(inline_int_null_value(), v(run_simple_agg( "SELECT ST_NPoints(p_null) from geospatial_test limit 1", dt))); ASSERT_EQ(inline_int_null_value(), @@ -1101,6 +1101,9 @@ TEST_P(GeoSpatialTestTablesFixture, Basics) { ASSERT_EQ(inline_int_null_value(), v(run_simple_agg( "SELECT ST_NPoints(mpoly_null) from geospatial_test limit 1", dt))); + ASSERT_EQ(inline_int_null_value(), + v(run_simple_agg( + "SELECT ST_NPoints(ST_Point(CAST(NULL AS DOUBLE), 1.0))", dt))); // ST_SRID, ST_SetSRID ASSERT_EQ(static_cast(0), @@ -1673,10 +1676,42 @@ TEST_P(GeoSpatialNullTablesFixture, GeoWithNulls) { "SELECT COUNT(*) FROM geospatial_null_test " "WHERE ST_Distance('MULTILINESTRING((-1 0, 0 1))', p) < 6.0;", dt))); + ASSERT_EQ(static_cast(2), + v(run_simple_agg( + "SELECT COUNT(*) FROM (SELECT ST_Distance(ST_Point(x, y), ST_Point(x, " + "y)) v FROM geospatial_null_test) WHERE v IS NULL;", + dt))); ASSERT_EQ("POINT (1 1)", boost::get(v(run_simple_agg( "SELECT p FROM geospatial_null_test WHERE id = 1;", dt, false)))); + + ASSERT_TRUE(v(run_simple_agg( + "SELECT ST_Point(1.0, CAST(NULL AS DOUBLE)) IS NULL;", dt, false))); + + ASSERT_TRUE(v(run_simple_agg( + "SELECT ST_Point(CAST(NULL AS DOUBLE), 1.0) IS NULL;", dt, false))); + + ASSERT_EQ(2, + v(run_simple_agg( + "SELECT COUNT_IF(ST_Point(x, y) IS NULL) FROM 
geospatial_null_test;", + dt, + false))); + + ASSERT_EQ( + 2, + v(run_simple_agg( + "SELECT COUNT_IF(ST_X(ST_Point(x, y)) IS NULL) FROM geospatial_null_test;", + dt, + false))); + + ASSERT_EQ( + 2, + v(run_simple_agg( + "SELECT COUNT_IF(ST_Y(ST_Point(x, y)) IS NULL) FROM geospatial_null_test;", + dt, + false))); + auto p = v( run_simple_agg("SELECT p FROM geospatial_null_test WHERE id = 2;", dt, false)); auto p_v = boost::get(&p); @@ -1723,6 +1758,30 @@ TEST_P(GeoSpatialNullTablesFixture, Constructors) { R"(SELECT ST_Transform(gp900913, 4326) FROM geospatial_null_test WHERE id = 6;)", dt, false))); + nullcheck_result(v(run_simple_agg( + R"(SELECT ST_Transform(ST_SetSRID(ST_Point(1.0, CAST(NULL AS DOUBLE)), 4326), 900913);)", + dt, + false))); + nullcheck_result(v(run_simple_agg( + R"(SELECT ST_Transform(ST_SetSRID(ST_Point(CAST(NULL AS DOUBLE), 1.0), 900913), 32601);)", + dt, + false))); + nullcheck_result(v(run_simple_agg( + R"(SELECT ST_Transform(ST_SetSRID(ST_Point(CAST(NULL AS DOUBLE), 1.0), 32601), 4326);)", + dt, + false))); + nullcheck_result(v(run_simple_agg( // x IS NULL for id=8 + R"(SELECT ST_Transform(ST_SetSRID(ST_Point(x, y), 4326), 32601) FROM geospatial_null_test WHERE id = 8;)", + dt, + false))); + nullcheck_result(v(run_simple_agg( // y IS NULL for id=9 + R"(SELECT ST_Transform(ST_SetSRID(ST_Point(x, y), 32601), 900913) FROM geospatial_null_test WHERE id = 9;)", + dt, + false))); + nullcheck_result(v(run_simple_agg( // x IS NULL for id=8 + R"(SELECT ST_Transform(ST_SetSRID(ST_Point(x, y), 900913), 32601) FROM geospatial_null_test WHERE id = 8;)", + dt, + false))); } } @@ -2544,6 +2603,67 @@ TEST(GeoSpatial, Projections) { } } +TEST(GeoSpatial, PointNOutOfBoundCheckNegativeIndex) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + struct TestInfo { + int32_t index; + double expected_res; + }; + for (auto const& test_info : + {TestInfo{-1, 3}, TestInfo{-2, 2}, TestInfo{-3, 1}, TestInfo{-4, NULL_DOUBLE}}) { + 
std::ostringstream oss; + oss << "SELECT ST_X(ST_POINTN('LINESTRING (1 0,2 2,3 3)'," << test_info.index + << "));"; + ASSERT_EQ(v(run_simple_agg(oss.str(), dt, false)), test_info.expected_res); + } + } +} + +TEST(GeoSpatial, PointNIndexOverflow) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + ASSERT_EQ( + v(run_simple_agg( + "SELECT COUNT(*) FROM (select ST_PointN(ST_GeomFromText('LINESTRING(-1 1, 1 " + "1)', 4326), 1241231231) IS NULL as v) WHERE v IS TRUE;", + dt, + false)), + static_cast(1)); + } +} + +TEST(GeoSpatial, PointNInvalidIndex) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + ASSERT_EQ( + v(run_simple_agg( + "SELECT COUNT(*) FROM (select ST_PointN(ST_GeomFromText('LINESTRING(-1 1, 1 " + "1)', 4326), 0) IS NULL as v) WHERE v IS TRUE;", + dt, + false)), + static_cast(1)); + ASSERT_EQ( + v(run_simple_agg( + "SELECT COUNT(*) FROM (select ST_PointN(ST_GeomFromText('LINESTRING(-1 1, 1 " + "1)', 4326), 3) IS NULL as v) WHERE v IS TRUE;", + dt, + false)), + static_cast(1)); + } +} + +TEST(GeoSpatial, PointNIndexLargerThanInt) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + // ST_PointN expecting integer index + ASSERT_ANY_THROW(run_simple_agg( + "SELECT ST_PointN(ST_GeomFromText('LINESTRING(-1 1, 1 1)', 4326), 2147483648);", + dt, + false)); + } +} + class GeoSpatialTempTables : public ::testing::Test { protected: void SetUp() override { import_geospatial_test(/*with_temporary_tables=*/false); } @@ -3307,6 +3427,55 @@ TEST(GeoSpatial, DISABLED_UTMTransformCoords) { } } +TEST(GeoSpatial, PointNGeoConstant) { + auto perform_test = + [](std::string query, int64_t expected, ExecutorDeviceType const& dt) { + ASSERT_EQ(expected, v(run_simple_agg(query, dt))); + }; + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + perform_test("SELECT ST_NPOINTS(ST_GeomFromText('POINT(0 0)'));", 1, dt); + 
perform_test("SELECT ST_NPOINTS(ST_GeomFromText('MULTIPOINT(0 0, 1 1)'));", 2, dt); + perform_test("SELECT ST_NPOINTS(ST_GeomFromText('LINESTRING(0 0, 1 1)'));", 2, dt); + perform_test("SELECT ST_NPOINTS(ST_GeomFromText('LINESTRING(0 0, 1.1 1)'));", 2, dt); + perform_test( + "SELECT ST_NPOINTS(ST_GeomFromText('LINESTRING(0 0, 1 -1.12)'));", 2, dt); + perform_test( + "SELECT ST_NPOINTS(ST_GeomFromText('LINESTRING(0 0, 3.234E-2 -3)'));", 2, dt); + perform_test( + "SELECT ST_NPOINTS(ST_GeomFromText('LINESTRING(0 0, -3 3.234E-2)'));", 2, dt); + perform_test( + "SELECT ST_NPOINTS(ST_GeomFromText('LINESTRING(0 0, 3 -3.0E2)'));", 2, dt); + perform_test( + "SELECT ST_NPOINTS(ST_GeomFromText('MULTILINESTRING((0 0, 1 1),(2 2, 3 3))'));", + 4, + dt); + perform_test( + "SELECT ST_NPOINTS(ST_GeomFromText('POLYGON((0 0, 1 1, 2 2, 0 0))'));", 4, dt); + perform_test( + "SELECT ST_NPOINTS(ST_GeomFromText('MULTIPOLYGON(((5 5, 6 6, 5 6)), ((0 0, 1 0, " + "0 1, 0 0)))'));", + 7, + dt); + perform_test( + "SELECT ST_NPOINTS(ST_GeomFromText('MULTIPOLYGON (((9 0,9 9,0.12314124 " + "19.12412314,0 0,9 0),(3 3,2 2,1 1,3 3)))'));", + 9, + dt); + } +} + +TEST(GeoSpatial, InvalidGeoConstantInSTFunc) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + ASSERT_ANY_THROW( + run_simple_agg(R"(SELECT ST_NPoints(ST_GeomFromText('POLYGON((0 0))'));)", dt)); + ASSERT_ANY_THROW(run_simple_agg( + R"(SELECT ST_Distance(ST_GeomFromText('POLYGON((0 0, 0 0))'), ST_GeomFromText('POLYGON((0 0, 0 0))'));)", + dt)); + } +} + INSTANTIATE_TEST_SUITE_P(GeospatialMultiFragExecutionTests, GeoSpatialMultiFragTestTablesFixture, ::testing::Values(true, false)); @@ -3806,6 +3975,7 @@ int main(int argc, char** argv) { logger::LogOptions log_options(argv[0]); log_options.max_files_ = 0; // stderr only by default + log_options.set_base_path(BASE_PATH); desc.add(log_options.get_options()); po::variables_map vm; diff --git a/Tests/GpuSharedMemoryTest.cpp b/Tests/GpuSharedMemoryTest.cpp 
index f165ca401f..a5e20c422b 100644 --- a/Tests/GpuSharedMemoryTest.cpp +++ b/Tests/GpuSharedMemoryTest.cpp @@ -20,6 +20,7 @@ #include "QueryEngine/OutputBufferInitialization.h" #include "QueryEngine/QueryEngine.h" #include "QueryEngine/ResultSetReductionJIT.h" +#include "Tests/DataMgrTestHelpers.h" extern bool g_is_test_env; @@ -182,23 +183,18 @@ std::unique_ptr compile_and_link_gpu_code( const auto ptx = CodeGenerator::generatePTX(cuda_llir, nvptx_target_machine.get(), context); - auto cubin_result = ptx_to_cubin(ptx, cuda_mgr); - auto& option_keys = cubin_result.option_keys; - auto& option_values = cubin_result.option_values; - auto cubin = cubin_result.cubin; - auto link_state = cubin_result.link_state; - const auto num_options = option_keys.size(); + CubinResult cubin_result = ptx_to_cubin(ptx, cuda_mgr); auto gpu_context = - std::make_unique(cubin, + std::make_unique(cubin_result.cubin, cubin_result.cubin_size, kernel_name, gpu_device_idx, cuda_mgr, - num_options, - &option_keys[0], - &option_values[0]); + cubin_result.option_keys.size(), + cubin_result.option_keys.data(), + cubin_result.option_values.data()); - checkCudaErrors(cuLinkDestroy(link_state)); + checkCudaErrors(cuLinkDestroy(cubin_result.link_state)); return gpu_context; } @@ -614,6 +610,7 @@ int main(int argc, char** argv) { TestHelpers::init_logger_stderr_only(argc, argv); testing::InitGoogleTest(&argc, argv); + TestHelpers::init_sys_catalog(); g_cuda_mgr.reset(new CudaMgr_Namespace::CudaMgr(0)); g_query_engine = QueryEngine::createInstance(g_cuda_mgr.get(), /*cpu_only=*/false); diff --git a/Tests/Import/datafiles/raster/USGS_1m_x30y441_OH_Columbus_2019_small_last_pixel_null.tif b/Tests/Import/datafiles/raster/USGS_1m_x30y441_OH_Columbus_2019_small_last_pixel_null.tif new file mode 100644 index 0000000000..ae27cb51b7 Binary files /dev/null and b/Tests/Import/datafiles/raster/USGS_1m_x30y441_OH_Columbus_2019_small_last_pixel_null.tif differ diff --git a/Tests/ImportExportTest.cpp 
b/Tests/ImportExportTest.cpp index 77af8a658d..11a4109a2c 100644 --- a/Tests/ImportExportTest.cpp +++ b/Tests/ImportExportTest.cpp @@ -5366,6 +5366,8 @@ TEST_F(BasicRasterImporterTest, HDF5ImageMultiThreaded) { static constexpr const char* kPNG = "beach.png"; static constexpr const char* kGeoTIFF = "USGS_1m_x30y441_OH_Columbus_2019_small.tif"; +static constexpr const char* kGeoTIFFLastPixelNull = + "USGS_1m_x30y441_OH_Columbus_2019_small_last_pixel_null.tif"; static constexpr const char* kGeoTIFFTruncated = "USGS_1m_x30y441_OH_Columbus_2019_small_truncated.tif"; static constexpr const char* kGeoTIFFDir = "geotif/"; @@ -5801,6 +5803,13 @@ TEST_F(RasterImportTest, ImportGeoTIFFTest) { {{-83.222766892364277, 39.818764365787992, 287.54092407226562}})); } +TEST_F(RasterImportTest, ImportGeoTIFFDropNullsTest) { + ASSERT_NO_THROW(importTestCommon(kGeoTIFFLastPixelNull, + ", raster_drop_if_all_null='true'", + "SELECT COUNT(*) FROM raster;", + {{39999L}})); +} + TEST_F(RasterImportTest, ImportGeoTIFFTruncatedTest) { ASSERT_NO_THROW(importTestCommon(kGeoTIFFTruncated, ", max_reject=1000000", @@ -6229,6 +6238,7 @@ int main(int argc, char** argv) { logger::LogOptions log_options(argv[0]); log_options.max_files_ = 0; // stderr only by default + log_options.set_base_path(BASE_PATH); desc.add(log_options.get_options()); po::variables_map vm; diff --git a/Tests/MLFunctionsTest.cpp b/Tests/MLFunctionsTest.cpp index 9e06d0430b..f9cf9a3450 100644 --- a/Tests/MLFunctionsTest.cpp +++ b/Tests/MLFunctionsTest.cpp @@ -355,17 +355,23 @@ std::vector get_supported_ml_frameworks() { } TEST_F(MLTableFunctionsTest, SupportedMLFrameworks) { - const std::vector expected_ml_frameworks = {"onedal", "mlpack"}; + const std::vector expected_ml_frameworks = {"onedal", "oneapi", "mlpack"}; std::vector expected_is_available; std::vector expected_is_default; bool found_ml_framework = false; #ifdef HAVE_ONEDAL expected_is_available.emplace_back(true); - 
expected_is_default.emplace_back(!found_ml_framework); + expected_is_default.emplace_back( + found_ml_framework); // onedal should be available but not default + expected_is_available.emplace_back(true); + expected_is_default.emplace_back( + !found_ml_framework); // oneapi should be available and default found_ml_framework = true; #else expected_is_available.emplace_back(false); expected_is_default.emplace_back(false); + expected_is_available.emplace_back(false); + expected_is_default.emplace_back(false); #endif #ifdef HAVE_MLPACK @@ -383,7 +389,7 @@ TEST_F(MLTableFunctionsTest, SupportedMLFrameworks) { "TABLE(supported_ml_frameworks()) ORDER BY ml_framework DESC;"; const auto rows = run_multiple_agg(query, dt); const size_t num_rows = rows->rowCount(); - EXPECT_EQ(num_rows, size_t(2)); + EXPECT_EQ(num_rows, size_t(3)); EXPECT_EQ(rows->colCount(), size_t(3)); for (size_t row_idx = 0; row_idx < num_rows; ++row_idx) { auto crt_row = rows->getNextRow(true, true); @@ -863,6 +869,13 @@ TEST_P(MLRegressionFunctionsTest, REG_MODEL_FIT_NO_ROWS) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + for (bool use_create_syntax : {false, true}) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); @@ -915,6 +928,13 @@ TEST_P(MLRegressionFunctionsTest, REG_MODEL_FIT) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + for (bool use_create_syntax : {false, true}) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); @@ -1139,6 
+1159,13 @@ TEST_P(MLRegressionFunctionsTest, MLRegPredict) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); for (bool make_args_named : {false, true}) { @@ -1209,6 +1236,13 @@ TEST_P(MLRegressionFunctionsTest, R2_SCORE) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { for (std::string numeric_data_type : {"DOUBLE"}) { const auto train_cursor_query = generate_cursor_query( @@ -1245,8 +1279,9 @@ TEST_P(MLRegressionFunctionsTest, R2_SCORE) { EXPECT_EQ(r2_rows->rowCount(), 1UL); EXPECT_EQ(r2_rows->colCount(), 1UL); auto r2_row = r2_rows->getNextRow(true, true); + // Non-deterministic, and has been seen as low as 0.945691. 
const double actual_r2 = TestHelpers::v(r2_row[0]); - const double expected_min_r2{0.95}; + const double expected_min_r2{0.93}; EXPECT_GE(actual_r2, expected_min_r2); } } @@ -1273,6 +1308,13 @@ TEST_P(MLRegressionFunctionsTest, ML_PREDICT_WRONG_NUM_REGRESSORS) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + const std::string train_query("SELECT * FROM TABLE(" + model_fit_func + "(model_name =>'" + model_name + "', " @@ -1301,6 +1343,13 @@ TEST_P(MLRegressionFunctionsTest, ML_PREDICT) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + const std::string train_query( "SELECT * FROM TABLE(" + model_fit_func + "(model_name =>'" + model_name + "', " @@ -1324,6 +1373,7 @@ TEST_P(MLRegressionFunctionsTest, ML_PREDICT) { const double allowed_epsilon{0.1}; for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); + std::cerr << "train_query: " << train_query << "\n"; EXPECT_NO_THROW(run_multiple_agg(train_query, dt)); const auto row_wise_prediction_avg = TestHelpers::v(run_simple_agg(row_wise_predict_query, dt)); @@ -1345,6 +1395,13 @@ TEST_P(MLRegressionFunctionsTest, ML_PREDICT_NULLS) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + const std::string train_query( "SELECT * FROM TABLE(" + model_fit_func + 
"(model_name =>'" + model_name + "', " @@ -1419,6 +1476,13 @@ TEST_P(MLCategoricalRegressionFunctionsTest, ML_PREDICT_CATEGORICAL_FEATURES_MIS if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + const std::string train_query("SELECT * FROM TABLE(" + model_fit_func + "(model_name =>'" + model_name + "', " @@ -1454,6 +1518,13 @@ TEST_P(MLCategoricalRegressionFunctionsTest, REG_MODEL_FIT_CAT_FEATURES_ONLY) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + for (bool use_create_syntax : {false, true}) { for (std::string numeric_data_type : {"DOUBLE"}) { const auto data_query = generate_cursor_query( @@ -1514,6 +1585,22 @@ TEST_P(MLCategoricalRegressionFunctionsTest, REG_MODEL_FIT_MIXED_FEATURES) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + + // FIXME: The oneAPI implementation of Linear Regression seems to be somewhat + // experimental, as for large models it finds different coefficients than the DAAL + // implementation, regardless of solver algorithm used. For now, we default to the + // DAAL implementation and skip testing the OneAPI versions. Once oneDAL is updated, + // this should be revisited to check the state of the Linear Regression models. 
+ if (ml_framework == "'oneapi'" && model_type == MLModelType::LINEAR_REG) { + continue; + } + for (bool use_create_syntax : {false, true}) { for (std::string numeric_data_type : {"DOUBLE"}) { const auto data_query = generate_cursor_query("craigslist_f150s", @@ -1570,7 +1657,7 @@ TEST_P(MLCategoricalRegressionFunctionsTest, REG_MODEL_FIT_MIXED_FEATURES) { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 1, 1}; - const std::vector expected_coefs = { + std::vector expected_coefs = { -2681553.0554, 1725.9846, 782.4798, 1086.2600, -2629.4910, -1294.4318, 2935.9767, 3032.3311, 524.7911, -482.2400, 4439.0142, 5996.3565, 273.6281, 656.8885, 8127.7868, 22739.9185, -107.2775, 1322.8270, @@ -1697,6 +1784,13 @@ TEST_P(MLCategoricalRegressionFunctionsTest, ML_PREDICT) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + for (std::string numeric_data_type : {"DOUBLE"}) { // Test two different orders of features, one categorical predictors first and the // other in mixed order @@ -1762,6 +1856,13 @@ TEST_P(MLCategoricalRegressionFunctionsTest, R2_SCORE) { if (model_type != MLModelType::LINEAR_REG && ml_framework == "'mlpack'") { continue; } + + // oneAPI does not support boosted trees or decision trees yet + if (ml_framework == "'oneapi'" && (model_type == MLModelType::GBT_REG || + model_type == MLModelType::DECISION_TREE_REG)) { + continue; + } + for (std::string numeric_data_type : {"DOUBLE"}) { const std::string train_query( "SELECT * FROM TABLE(" + model_fit_func + "(model_name =>'" + model_name + diff --git a/Tests/OneDALBenchmark.cpp b/Tests/OneDALBenchmark.cpp new file mode 100644 index 0000000000..40fdc8ddcd --- /dev/null +++ b/Tests/OneDALBenchmark.cpp @@ -0,0 +1,295 @@ +/* + * 
Copyright 2022 HEAVY.AI, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TestHelpers.h" + +#include +#include +#include +#include + +#include "../ImportExport/Importer.h" +#include "../Logger/Logger.h" +#include "../QueryEngine/ResultSet.h" +#include "../QueryRunner/QueryRunner.h" + +#ifndef BASE_PATH +#define BASE_PATH "./tmp" +#endif + +using QR = QueryRunner::QueryRunner; + +inline void run_ddl_statement(const std::string& create_table_stmt) { + QR::get()->runDDLStatement(create_table_stmt); +} + +std::shared_ptr run_multiple_agg(const std::string& query_str, + const ExecutorDeviceType device_type) { + return QR::get()->runSQL( + query_str, device_type, /*hoist_literals=*/true, /*allow_loop_joins=*/true); +} + +TargetValue run_simple_agg(const std::string& query_str, + const ExecutorDeviceType device_type) { + auto rows = QR::get()->runSQL(query_str, device_type, /*allow_loop_joins=*/true); + auto crt_row = rows->getNextRow(true, true); + CHECK_EQ(size_t(1), crt_row.size()) << query_str; + return crt_row[0]; +} + +std::once_flag setup_flag; +void global_setup() { + TestHelpers::init_logger_stderr_only(); + QR::init(BASE_PATH); + + boost::filesystem::path lz4_data_path{ + "../../Tests/OneDALBenchmarkDataFiles/florida_parcels_2020.dump.lz4"}; + + if (!boost::filesystem::exists(lz4_data_path)) { + throw std::runtime_error("florida_parcels data not found at " + + boost::filesystem::canonical(lz4_data_path).string()); + } + + 
run_ddl_statement("DROP TABLE IF EXISTS florida_parcels_2020;"); + run_ddl_statement("RESTORE TABLE florida_parcels_2020 FROM '" + + boost::filesystem::canonical(lz4_data_path).string() + + "' WITH (COMPRESSION='lz4');"); + + // make sure we're warmed up + run_multiple_agg("SELECT * FROM florida_parcels_2020 LIMIT 10000;", + ExecutorDeviceType::CPU); +} + +class DalFixture : public benchmark::Fixture { + public: + void SetUp(const ::benchmark::State& state) override { + std::call_once(setup_flag, global_setup); + } +}; + +//! Run KMeans clustering for OneDAL +BENCHMARK_DEFINE_F(DalFixture, OneDALKMeansClustering)(benchmark::State& state) { + for (auto _ : state) { + run_multiple_agg( + "SELECT * FROM TABLE(KMEANS(data => CURSOR(SELECT PARCELID as id, LAT_DD, " + "LONG_DD " + "FROM florida_parcels_2020), num_clusters => " + + std::to_string(state.range(0)) + ", num_iterations => " + + std::to_string(state.range(1)) + + ", preferred_ml_framework => 'ONEDAL')) ORDER BY id;", + ExecutorDeviceType::CPU); + } +} + +//! Run KMeans clustering for OneAPI +BENCHMARK_DEFINE_F(DalFixture, OneAPIKMeansClustering)(benchmark::State& state) { + for (auto _ : state) { + run_multiple_agg( + "SELECT * FROM TABLE(KMEANS(data => CURSOR(SELECT PARCELID as id, LAT_DD, " + "LONG_DD " + "FROM florida_parcels_2020), num_clusters => " + + std::to_string(state.range(0)) + ", num_iterations => " + + std::to_string(state.range(1)) + + ", preferred_ml_framework => 'ONEAPI')) ORDER BY id;", + ExecutorDeviceType::CPU); + } +} + +//! 
Run DBScan clustering for OneDAL +BENCHMARK_DEFINE_F(DalFixture, OneDALDBScanClustering)(benchmark::State& state) { + for (auto _ : state) { + run_multiple_agg( + "SELECT * FROM TABLE(DBSCAN(data => CURSOR(SELECT PARCELID as id, LAT_DD, " + "LONG_DD " + "FROM florida_parcels_2020 LIMIT 1000000), min_observations => " + + std::to_string(state.range(0)) + + ", epsilon => 0.5" + ", preferred_ml_framework => 'ONEDAL')) ORDER BY id;", + ExecutorDeviceType::CPU); + } +} + +//! Run DBScan clustering for OneAPI +BENCHMARK_DEFINE_F(DalFixture, OneAPIDBScanClustering)(benchmark::State& state) { + for (auto _ : state) { + run_multiple_agg( + "SELECT * FROM TABLE(DBSCAN(data => CURSOR(SELECT PARCELID as id, LAT_DD, " + "LONG_DD FROM florida_parcels_2020 LIMIT 1000000), min_observations => " + + std::to_string(state.range(0)) + + ", epsilon => 0.5, preferred_ml_framework => 'ONEAPI')) ORDER BY id;", + ExecutorDeviceType::CPU); + } +} + +//! Run PCA for OneDAL +BENCHMARK_DEFINE_F(DalFixture, OneDALPrincipalComponentAnalysis) +(benchmark::State& state) { + for (auto _ : state) { + std::string fit_query = + "SELECT * FROM TABLE(PCA_FIT(data => CURSOR(SELECT LAT_DD, LONG_DD, " + "Shape_Length, " + "Shape_Area FROM florida_parcels_2020), model_name => 'PCA_MODEL_ONEDAL', " + "preferred_ml_framework => 'ONEDAL'));"; + std::string project_subquery = + "PCA_PROJECT('PCA_MODEL_ONEDAL', LAT_DD, LONG_DD, Shape_Length, Shape_Area, "; + std::string project_query = + "SELECT LAT_DD, LONG_DD, Shape_Length, Shape_Area, " + project_subquery + + "1) AS pca_1, " + project_subquery + "2) AS pca_2, " + project_subquery + + "3) AS pca_3, " + project_subquery + "4) AS pca_4 FROM florida_parcels_2020;"; + run_multiple_agg(fit_query, ExecutorDeviceType::CPU); + run_multiple_agg(project_query, ExecutorDeviceType::CPU); + } +} + +//! 
Run PCA for OneAPI +BENCHMARK_DEFINE_F(DalFixture, OneAPIPrincipalComponentAnalysis) +(benchmark::State& state) { + for (auto _ : state) { + std::string fit_query = + "SELECT * FROM TABLE(PCA_FIT(data => CURSOR(SELECT LAT_DD, LONG_DD, " + "Shape_Length, " + "Shape_Area FROM florida_parcels_2020), model_name => 'PCA_MODEL_ONEAPI', " + "preferred_ml_framework => 'ONEAPI'));"; + std::string project_subquery = + "PCA_PROJECT('PCA_MODEL_ONEAPI', LAT_DD, LONG_DD, Shape_Length, Shape_Area, "; + std::string project_query = + "SELECT LAT_DD, LONG_DD, Shape_Length, Shape_Area, " + project_subquery + + "1) AS pca_1, " + project_subquery + "2) AS pca_2, " + project_subquery + + "3) AS pca_3, " + project_subquery + "4) AS pca_4 FROM florida_parcels_2020;"; + run_multiple_agg(fit_query, ExecutorDeviceType::CPU); + run_multiple_agg(project_query, ExecutorDeviceType::CPU); + } +} + +//! Run Linear Regression for OneDAL +BENCHMARK_DEFINE_F(DalFixture, OneDALLinearReg)(benchmark::State& state) { + for (auto _ : state) { + std::string fit_query = + "SELECT * FROM TABLE(LINEAR_REG_FIT(model_name => 'LINEAR_REG_ONEDAL', data => " + "CURSOR(SELECT CAST(SALEPRC1 AS DOUBLE), OSTATE, LAT_DD, LONG_DD, Shape_Length, " + "Shape_Area FROM florida_parcels_2020), preferred_ml_framework => 'ONEDAL', " + "cat_top_k => 10, cat_min_fraction => 0.0001));"; + std::string predict_query = + "SELECT ML_PREDICT('LINEAR_REG_ONEDAL', OSTATE, LAT_DD, LONG_DD, Shape_Length, " + "Shape_Area) FROM florida_parcels_2020;"; + run_multiple_agg(fit_query, ExecutorDeviceType::CPU); + run_multiple_agg(predict_query, ExecutorDeviceType::CPU); + } +} + +//! 
Run Linear Regression for OneAPI +BENCHMARK_DEFINE_F(DalFixture, OneAPILinearReg)(benchmark::State& state) { + for (auto _ : state) { + std::string fit_query = + "SELECT * FROM TABLE(LINEAR_REG_FIT(model_name => 'LINEAR_REG_ONEAPI', data => " + "CURSOR(SELECT CAST(SALEPRC1 AS DOUBLE), OSTATE, LAT_DD, LONG_DD, Shape_Length, " + "Shape_Area FROM florida_parcels_2020), preferred_ml_framework => 'ONEAPI', " + "cat_top_k => 10, cat_min_fraction => 0.0001));"; + std::string predict_query = + "SELECT ML_PREDICT('LINEAR_REG_ONEAPI', OSTATE, LAT_DD, LONG_DD, Shape_Length, " + "Shape_Area) FROM florida_parcels_2020;"; + run_multiple_agg(fit_query, ExecutorDeviceType::CPU); + run_multiple_agg(predict_query, ExecutorDeviceType::CPU); + } +} + +//! Run Random Forest Regression for OneDAL +BENCHMARK_DEFINE_F(DalFixture, OneDALRandomForest)(benchmark::State& state) { + for (auto _ : state) { + std::string fit_query = + "SELECT * FROM TABLE(RANDOM_FOREST_REG_FIT(model_name => 'RANDOM_FOREST_ONEDAL', " + "data => CURSOR(SELECT CAST(SALEPRC1 AS DOUBLE), OSTATE, LAT_DD, LONG_DD, " + "Shape_Length, Shape_Area FROM florida_parcels_2020), preferred_ml_framework => " + "'ONEDAL', cat_top_k => 10, cat_min_fraction => 0.0001));"; + std::string predict_query = + "SELECT ML_PREDICT('RANDOM_FOREST_ONEDAL', OSTATE, LAT_DD, LONG_DD, " + "Shape_Length, Shape_Area) FROM florida_parcels_2020;"; + run_multiple_agg(fit_query, ExecutorDeviceType::CPU); + run_multiple_agg(predict_query, ExecutorDeviceType::CPU); + } +} + +//! 
Run Random Forest Regression for OneAPI +BENCHMARK_DEFINE_F(DalFixture, OneAPIRandomForest)(benchmark::State& state) { + for (auto _ : state) { + std::string fit_query = + "SELECT * FROM TABLE(RANDOM_FOREST_REG_FIT(model_name => 'RANDOM_FOREST_ONEAPI', " + "data => CURSOR(SELECT CAST(SALEPRC1 AS DOUBLE), OSTATE, LAT_DD, LONG_DD, " + "Shape_Length, Shape_Area FROM florida_parcels_2020), preferred_ml_framework => " + "'ONEAPI', cat_top_k => 10, cat_min_fraction => 0.0001));"; + std::string predict_query = + "SELECT ML_PREDICT('RANDOM_FOREST_ONEAPI', OSTATE, LAT_DD, LONG_DD, " + "Shape_Length, Shape_Area) FROM florida_parcels_2020;"; + run_multiple_agg(fit_query, ExecutorDeviceType::CPU); + run_multiple_agg(predict_query, ExecutorDeviceType::CPU); + } +} + +BENCHMARK_REGISTER_F(DalFixture, OneDALKMeansClustering) + ->Args({3, 10}) + ->MeasureProcessCPUTime() + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK_REGISTER_F(DalFixture, OneDALDBScanClustering) + ->Args({10}) + ->MeasureProcessCPUTime() + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK_REGISTER_F(DalFixture, OneDALPrincipalComponentAnalysis) + ->MeasureProcessCPUTime() + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK_REGISTER_F(DalFixture, OneDALLinearReg) + ->MeasureProcessCPUTime() + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK_REGISTER_F(DalFixture, OneDALRandomForest) + ->MeasureProcessCPUTime() + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK_REGISTER_F(DalFixture, OneAPIKMeansClustering) + ->Args({3, 10}) + ->MeasureProcessCPUTime() + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK_REGISTER_F(DalFixture, OneAPIDBScanClustering) + ->Args({10}) + ->MeasureProcessCPUTime() + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK_REGISTER_F(DalFixture, OneAPIPrincipalComponentAnalysis) + ->MeasureProcessCPUTime() + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + 
+BENCHMARK_REGISTER_F(DalFixture, OneAPILinearReg) + ->MeasureProcessCPUTime() + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK_REGISTER_F(DalFixture, OneAPIRandomForest) + ->MeasureProcessCPUTime() + ->UseRealTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/Tests/ParallelExecutorsTest.cpp b/Tests/ParallelExecutorsTest.cpp index 03e9e41b70..34bddf404b 100644 --- a/Tests/ParallelExecutorsTest.cpp +++ b/Tests/ParallelExecutorsTest.cpp @@ -142,6 +142,13 @@ class BaseTestFixture : public DBHandlerTestFixture, check_returned_rows("SELECT d, f, COUNT(*) FROM " + table_name + " GROUP BY d, f ORDER BY f DESC NULLS LAST LIMIT 5;", 5); + sqlAndCompareResult("SELECT COUNT(*) FROM " + table_name + " WHERE str like 'hello';", + {{i(10)}}); + sqlAndCompareResult( + "SELECT COUNT(*) FROM " + table_name + " WHERE str ilike 'hello';", {{i(10)}}); + sqlAndCompareResult( + "SELECT COUNT(*) FROM " + table_name + " WHERE str REGEXP '^[a-z]+r$';", + {{i(0)}}); check_returned_rows( "SELECT approx_count_distinct(d), approx_count_distinct(str), i64, i32, " "i16 FROM " + diff --git a/Tests/ProfileTest.cpp b/Tests/ProfileTest.cpp index 51bcebdab6..c83b94a534 100644 --- a/Tests/ProfileTest.cpp +++ b/Tests/ProfileTest.cpp @@ -1309,8 +1309,8 @@ TEST(Reduction, Baseline) { #else const bool has_multi_gpus = false; #endif // HAVE_CUDA - const auto row_set_mem_owner = std::make_shared( - g_arena_block_size, 0, /*num_worker_threads=*/1); + const auto row_set_mem_owner = + std::make_shared(g_arena_block_size, 0); std::vector> results; for (size_t i = 0; i < result_count; ++i) { auto rs = std::make_unique( @@ -1576,7 +1576,7 @@ TEST(Reduction, PerfectHash) { const bool has_multi_gpus = false; #endif // HAVE_CUDA const auto row_set_mem_owner = - std::make_shared(g_arena_block_size, 0, /*num_threads=*/1); + std::make_shared(g_arena_block_size, 0); std::vector> results; for (size_t i = 0; i < result_count; ++i) { auto rs = 
std::make_unique( diff --git a/Tests/ResultSetBaselineRadixSortTest.cpp b/Tests/ResultSetBaselineRadixSortTest.cpp index 6afe92681f..0cf534c0dc 100644 --- a/Tests/ResultSetBaselineRadixSortTest.cpp +++ b/Tests/ResultSetBaselineRadixSortTest.cpp @@ -25,6 +25,7 @@ #include "QueryEngine/QueryEngine.h" #include "QueryEngine/ResultSet.h" #include "QueryEngine/RuntimeFunctions.h" +#include "Tests/DataMgrTestHelpers.h" #include "Tests/ResultSetTestUtils.h" #include "Tests/TestHelpers.h" @@ -317,6 +318,7 @@ int main(int argc, char** argv) { TestHelpers::init_logger_stderr_only(argc, argv); testing::InitGoogleTest(&argc, argv); + TestHelpers::init_sys_catalog(); #ifdef HAVE_CUDA try { diff --git a/Tests/ResultSetTest.cpp b/Tests/ResultSetTest.cpp index 01c137719e..953180ab13 100644 --- a/Tests/ResultSetTest.cpp +++ b/Tests/ResultSetTest.cpp @@ -29,6 +29,7 @@ #include "QueryEngine/RuntimeFunctions.h" #include "QueryRunner/QueryRunner.h" #include "StringDictionary/StringDictionary.h" +#include "Tests/DataMgrTestHelpers.h" #include "Tests/TestHelpers.h" #include @@ -62,13 +63,10 @@ bool skip_tests(const ExecutorDeviceType device_type) { TEST(Construct, Allocate) { std::vector target_infos; QueryMemoryDescriptor query_mem_desc; - ResultSet result_set(target_infos, - ExecutorDeviceType::CPU, - query_mem_desc, - std::make_shared(Executor::getArenaBlockSize(), - Executor::UNITARY_EXECUTOR_ID), - 0, - 0); + auto row_set_mem_owner = std::make_shared( + Executor::getArenaBlockSize(), Executor::UNITARY_EXECUTOR_ID); + ResultSet result_set( + target_infos, ExecutorDeviceType::CPU, query_mem_desc, row_set_mem_owner, 0, 0); result_set.allocateStorage(); } @@ -3172,6 +3170,12 @@ int main(int argc, char** argv) { QR::init(BASE_PATH); + // Set a large enough max CPU slab size for the ReduceLargeBuffers* test cases. 
+ SystemParameters sys_params; + sys_params.max_cpu_slab_size = 12500000000; + auto& data_mgr = Catalog_Namespace::SysCatalog::instance().getDataMgr(); + data_mgr.resetBufferMgrs({}, 0, sys_params); + int err{0}; try { err = RUN_ALL_TESTS(); diff --git a/Tests/RuntimeInterruptTest.cpp b/Tests/RuntimeInterruptTest.cpp index d04dc90093..157b0d7eec 100644 --- a/Tests/RuntimeInterruptTest.cpp +++ b/Tests/RuntimeInterruptTest.cpp @@ -585,7 +585,7 @@ TEST(Non_Kernel_Time_Interrupt, Interrupt_COPY_statement_CSV) { try { QR::get()->runDDLStatement(import_very_large_table_str); } catch (const QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { catchInterruption.store(true); } else { throw e; @@ -667,7 +667,7 @@ TEST(Non_Kernel_Time_Interrupt, Interrupt_COPY_statement_Parquet) { try { QR::get()->runDDLStatement(import_very_large_parquet_table_str); } catch (const QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { catchInterruption.store(true); } else { throw e; @@ -750,7 +750,7 @@ TEST(Non_Kernel_Time_Interrupt, Interrupt_COPY_statement_CSV_Sharded) { try { QR::get()->runDDLStatement(import_very_large_sharded_table_str); } catch (const QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { catchInterruption.store(true); } else { throw e; @@ -833,7 +833,7 @@ TEST(Non_Kernel_Time_Interrupt, Interrupt_COPY_statement_GDAL) { try { geofile_importer_for_interrupt_test(session1); } catch (const QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { catchInterruption.store(true); } else { throw e; @@ -914,7 +914,7 @@ TEST(Non_Kernel_Time_Interrupt, Interrupt_COPY_statement_Geo) { try { QR::get()->runDDLStatement(import_geo_table_str); } catch (const QueryExecutionError& e) { - if 
(e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { catchInterruption.store(true); } else { throw e; @@ -1017,7 +1017,7 @@ TEST(Non_Kernel_Time_Interrupt, Interrupt_During_Reduction) { session1); CHECK_EQ(assigned_executor_ids.size(), static_cast(1)); } catch (const QueryExecutionError& e) { - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { // timing issue... the query is interrupted before entering the reduction catchInterruption.store(true); } else { diff --git a/Tests/SQLHintTest.cpp b/Tests/SQLHintTest.cpp index 5839b4c293..9a0d53ca8a 100644 --- a/Tests/SQLHintTest.cpp +++ b/Tests/SQLHintTest.cpp @@ -243,6 +243,16 @@ TEST(QueryHint, ForceToCPUMode) { {{QueryHint::kCpuMode, false}})); } +TEST(QueryHint, DisableTableReordering) { + const auto q = + "SELECT /*+ table_reordering_off */ * FROM JOIN_HINT_TEST R, JOIN_HINT_TEST S " + "WHERE R.v = S.v"; + auto query_hints = QR::get()->getParsedQueryHint(q); + const bool hint_enabled = query_hints.isHintRegistered(QueryHint::kTableReorderingOff); + EXPECT_TRUE(hint_enabled); + EXPECT_TRUE(check_serialized_rel_alg_dag(q, {{QueryHint::kTableReorderingOff, false}})); +} + TEST(QueryHint, QueryHintForBoundingBoxIntersection) { ScopeGuard reset_loop_join_state = [orig_bbox_intersect_hash_join = g_enable_bbox_intersect_hashjoin] { @@ -1367,6 +1377,33 @@ TEST(QueryHint, ProjectionScanLimit) { EXPECT_FALSE(projection_q2_hint.has_value()); } +TEST(QueryHint, NDVGroupsEstimatorCorrection) { + const auto q1 = + "SELECT /*+ ndv_groups_estimator_multiplier(1.0) */ * FROM JOIN_HINT_TEST R;"; + auto q1_hints = QR::get()->getParsedQueryHint(q1); + EXPECT_TRUE(q1_hints.isHintRegistered(QueryHint::kNDVGroupsEstimatorMultiplier)); + + const auto q2 = + "SELECT /*+ ndv_groups_estimator_multiplier(0.9) */ * FROM JOIN_HINT_TEST R;"; + auto q2_hints = QR::get()->getParsedQueryHint(q2); + 
EXPECT_FALSE(q2_hints.isHintRegistered(QueryHint::kNDVGroupsEstimatorMultiplier)); + + const auto q3 = + "SELECT /*+ ndv_groups_estimator_multiplier(-1.1) */ * FROM JOIN_HINT_TEST R;"; + auto q3_hints = QR::get()->getParsedQueryHint(q3); + EXPECT_FALSE(q3_hints.isHintRegistered(QueryHint::kNDVGroupsEstimatorMultiplier)); + + const auto q4 = + "SELECT /*+ ndv_groups_estimator_multiplier(2.0) */ * FROM JOIN_HINT_TEST R;"; + auto q4_hints = QR::get()->getParsedQueryHint(q4); + EXPECT_TRUE(q4_hints.isHintRegistered(QueryHint::kNDVGroupsEstimatorMultiplier)); + + const auto q5 = + "SELECT /*+ ndv_groups_estimator_multiplier(2.1) */ * FROM JOIN_HINT_TEST R;"; + auto q5_hints = QR::get()->getParsedQueryHint(q5); + EXPECT_FALSE(q5_hints.isHintRegistered(QueryHint::kNDVGroupsEstimatorMultiplier)); +} + int main(int argc, char** argv) { TestHelpers::init_logger_stderr_only(argc, argv); testing::InitGoogleTest(&argc, argv); diff --git a/Tests/ShardedTableEpochConsistencyTest.cpp b/Tests/ShardedTableEpochConsistencyTest.cpp index 7d7f2fa4d2..78118dc0da 100644 --- a/Tests/ShardedTableEpochConsistencyTest.cpp +++ b/Tests/ShardedTableEpochConsistencyTest.cpp @@ -1156,6 +1156,7 @@ int main(int argc, char** argv) { logger::LogOptions log_options(argv[0]); log_options.severity_ = logger::Severity::FATAL; + log_options.set_base_path(BASE_PATH); log_options.set_options(); // update default values desc.add(log_options.get_options()); diff --git a/Tests/ShowCommandsDdlTest.cpp b/Tests/ShowCommandsDdlTest.cpp index 48103e012a..47f8b07467 100644 --- a/Tests/ShowCommandsDdlTest.cpp +++ b/Tests/ShowCommandsDdlTest.cpp @@ -4543,13 +4543,14 @@ TEST_F(SystemTablesTest, MemoryDetailsSystemTableCpu) { {Null, Null, Null, Null, i(0), "CPU", "FREE", slab_pages - 1, getCpuPageSize(), i(0), i(1)}}); } else { + int64_t last_touched_epoch = (g_use_cpu_mem_pool_for_output_buffers ? 
1 : 0); sqlAndCompareResult("SELECT * FROM memory_details WHERE device_type = 'CPU' ORDER BY page_count;", {{"Server", db_id, shared::kDefaultDbName, table_id, "test_table_1", i(1), "i", array({db_id, table_id, i(1), i(0)}), i(0), "CPU", "USED", i(1), getCpuPageSize(), i(0), i(0), i(0)}, {"Server", Null, Null, Null, Null, Null, Null, Null, i(0), "CPU", "FREE", getAllocatedCpuPageCount() - 1, - getCpuPageSize(), i(0), i(1), i(0)}}); + getCpuPageSize(), i(0), i(1), last_touched_epoch}}); } // clang-format on } diff --git a/Tests/SpecialCharsTest.cpp b/Tests/SpecialCharsTest.cpp index 8d2109fa66..9f0e553c76 100644 --- a/Tests/SpecialCharsTest.cpp +++ b/Tests/SpecialCharsTest.cpp @@ -166,6 +166,7 @@ int main(int argc, char** argv) { logger::LogOptions log_options(argv[0]); log_options.max_files_ = 0; // stderr only by default + log_options.set_base_path(BASE_PATH); desc.add(log_options.get_options()); po::variables_map vm; diff --git a/Tests/StringFunctionsTest.cpp b/Tests/StringFunctionsTest.cpp index b81f3a2a96..c4362c7234 100644 --- a/Tests/StringFunctionsTest.cpp +++ b/Tests/StringFunctionsTest.cpp @@ -248,11 +248,11 @@ class StringFunctionTest : public TestHelpers::TbbPrivateServerKiller { insert into string_function_test_people values(3, 'JOHN', 'Wilson', 'John WILSON', 20, 'cA', '555-614-9814', null, 'What is the sound of one hand clapping?', 'JOHN.WILSON@geops.net'); insert into string_function_test_people values(4, 'Sue', 'Smith', 'Sue SMITH', 25, 'CA', '555-614-2282', null, 'Nothing exists entirely alone. Everything is always in relation to everything else.', 'Find me at sue4tw@example.com, or reach me at sue.smith@example.com. 
I''d love to hear from you!'); drop table if exists string_function_test_countries; - create table string_function_test_countries(id int, code text, arrow_code text, name text, short_name text encoding none, capital text, largest_city text encoding none, lang text encoding none, json_data_none text encoding none); - insert into string_function_test_countries values(1, 'US', '>>US<<', 'United States', null, 'Washington', 'New York City', 'en', '{"capital": "Washington D.C.", "pop": 329500000, "independence_day": "1776-07-04", "has_prime_minister": false, "prime_minister": null, "factoids": {"gdp_per_cap_2015_2020": [56863, 58021, 60110, 63064, 65280, 63544], "Last 3 leaders": ["Barack Obama", "Donald Trump", "Joseph Biden"], "most_valuable_crop": "corn"}}'); - insert into string_function_test_countries values(2, 'ca', '>>CA<<', 'Canada', 'Canada', 'Ottawa', 'TORONTO', 'EN', '{"capital": "Toronto", "pop": 38010000, "independence_day": "07/01/1867", "exchange_rate_usd": "0.78125", "has_prime_minister": true, "prime_minister": "Justin Trudeau", "factoids": {"gdp_per_cap_2015_2020": [43596, 42316, 45129, 46454, 46327, 43242], "Last 3 leaders": ["Paul Martin", "Stephen Harper", "Justin Trudeau"], "most valuable crop": "wheat"}}'); - insert into string_function_test_countries values(3, 'Gb', '>>GB<<', 'United Kingdom', 'UK', 'London', 'LONDON', 'en', '{"capital": "London", "pop": 67220000, "independence_day": "N/A", "exchange_rate_usd": 1.21875, "prime_minister": "Boris Johnson", "has_prime_minister": true, "factoids": {"gdp_per_cap_2015_2020": [45039, 41048, 40306, 42996, 42354, 40285], "most valuable crop": "wheat"}}'); - insert into string_function_test_countries values(4, 'dE', '>>DE<<', 'Germany', 'Germany', 'Berlin', 'Berlin', 'de', '{"capital":"Berlin", "independence_day": "1990-10-03", "exchange_rate_usd": 1.015625, "has_prime_minister": false, "prime_minister": null, "factoids": {"gdp_per_cap_2015_2020": [41103, 42136, 44453, 47811, 46468, 45724], "most valuable 
crop": "wheat"}}'); + create table string_function_test_countries(id int, code text, arrow_code text, name text, short_name text encoding none, capital text, capital_none text encoding none, largest_city text encoding none, lang text encoding none, json_data_none text encoding none); + insert into string_function_test_countries values(1, 'US', '>>US<<', 'United States', null, 'Washington', 'Washington', 'New York City', 'en', '{"capital": "Washington D.C.", "pop": 329500000, "independence_day": "1776-07-04", "has_prime_minister": false, "prime_minister": null, "factoids": {"gdp_per_cap_2015_2020": [56863, 58021, 60110, 63064, 65280, 63544], "Last 3 leaders": ["Barack Obama", "Donald Trump", "Joseph Biden"], "most_valuable_crop": "corn"}}'); + insert into string_function_test_countries values(2, 'ca', '>>CA<<', 'Canada', 'Canada', 'Ottawa', 'Ottawa', 'TORONTO', 'EN', '{"capital": "Toronto", "pop": 38010000, "independence_day": "07/01/1867", "exchange_rate_usd": "0.78125", "has_prime_minister": true, "prime_minister": "Justin Trudeau", "factoids": {"gdp_per_cap_2015_2020": [43596, 42316, 45129, 46454, 46327, 43242], "Last 3 leaders": ["Paul Martin", "Stephen Harper", "Justin Trudeau"], "most valuable crop": "wheat"}}'); + insert into string_function_test_countries values(3, 'Gb', '>>GB<<', 'United Kingdom', 'UK', 'London', 'London', 'LONDON', 'en', '{"capital": "London", "pop": 67220000, "independence_day": "N/A", "exchange_rate_usd": 1.21875, "prime_minister": "Boris Johnson", "has_prime_minister": true, "factoids": {"gdp_per_cap_2015_2020": [45039, 41048, 40306, 42996, 42354, 40285], "most valuable crop": "wheat"}}'); + insert into string_function_test_countries values(4, 'dE', '>>DE<<', 'Germany', 'Germany', 'Berlin', 'Berlin', 'Berlin', 'de', '{"capital":"Berlin", "independence_day": "1990-10-03", "exchange_rate_usd": 1.015625, "has_prime_minister": false, "prime_minister": null, "factoids": {"gdp_per_cap_2015_2020": [41103, 42136, 44453, 47811, 46468, 45724], 
"most valuable crop": "wheat"}}'); drop table if exists numeric_to_string_test; create table numeric_to_string_test(b boolean, ti tinyint, si smallint, i int, bi bigint, flt float, dbl double, dec_5_2 decimal(5, 2), dec_18_10 decimal(18, 10), dt date, ts_0 timestamp(0), ts_3 timestamp(3), tm time, b_str text, ti_str text, si_str text, i_str text, bi_str text, flt_str text, dbl_str text, dec_5_2_str text, dec_18_10_str text, dt_str text, ts_0_str text, ts_3_str text, tm_str text) with (fragment_size=2); insert into numeric_to_string_test values (true, 21, 21, 21, 21, 1.25, 1.25, 1.25, 1.25, '2013-09-10', '2013-09-10 12:43:23', '2013-09-10 12:43:23.123', '12:43:23', 'true', '21', '21', '21', '21', '1.250000', '1.250000', ' 1.25', ' 1.2500000000', '2013-09-10', '2013-09-10 12:43:23', '2013-09-10 12:43:23.123', '12:43:23'); @@ -1397,13 +1397,91 @@ TEST_F(StringFunctionTest, RegexpSubstrLiteral) { SKIP_NO_GPU(); auto result_set = sql("select regexp_substr('Feel free to send us an email at spam@devnull.com!', " - "'[[:alnum:]]+@[[:alnum:]]+.[[:alnum:]]+', 1, -1, 'i', 0);", + "'[[:alnum:]]+@[[:alnum:]]+\\.[[:alnum:]]+', 1, -1, 'i', 0);", dt); std::vector> expected_result_set{{"spam@devnull.com"}}; compare_result_set(expected_result_set, result_set); } } +TEST_F(StringFunctionTest, RegexpCount2Args) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + auto result_set = + sql("select regexp_count(json_data_none, 'in') " + "from string_function_test_countries order by id asc;", + dt); + std::vector> expected_result_set{ + {int64_t(4)}, {int64_t(6)}, {int64_t(3)}, {int64_t(4)}}; + compare_result_set(expected_result_set, result_set); + } +} + +TEST_F(StringFunctionTest, RegexpCount3Args) { + // 3rd argument to RegexpCount is starting position to search for matches + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + auto result_set = + sql("select regexp_count(json_data_none, 'in', 50) " + "from 
string_function_test_countries order by id asc;", + dt); + std::vector> expected_result_set{ + {int64_t(3)}, {int64_t(5)}, {int64_t(2)}, {int64_t(2)}}; + compare_result_set(expected_result_set, result_set); + } +} + +TEST_F(StringFunctionTest, RegexpCount4Args) { + // 4th argument to RegexpCount is for regex parameters. + // Notably 'c' specifies case sensitive, and 'i' specifies case insensitive + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + { + // Case-sensitive default + auto result_set = + sql("select regexp_count(personal_motto, 'one', 1) from " + "string_function_test_people order by id asc;", + dt); + std::vector> expected_result_set{ + {int64_t(2)}, {int64_t(1)}, {int64_t(1)}, {int64_t(1)}}; + compare_result_set(expected_result_set, result_set); + } + { + // Case-sensitive default + auto result_set = + sql("select regexp_count(personal_motto, 'one', 1, 'c') from " + "string_function_test_people order by id asc;", + dt); + std::vector> expected_result_set{ + {int64_t(2)}, {int64_t(1)}, {int64_t(1)}, {int64_t(1)}}; + compare_result_set(expected_result_set, result_set); + } + { + // Case-insensitive search + auto result_set = + sql("select regexp_count(personal_motto, 'one', 1, 'i') from " + "string_function_test_people order by id asc;", + dt); + std::vector> expected_result_set{ + {int64_t(2)}, {int64_t(2)}, {int64_t(1)}, {int64_t(1)}}; + compare_result_set(expected_result_set, result_set); + } + } +} + +TEST_F(StringFunctionTest, RegexpCountLiteral) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + auto result_set = + sql("select regexp_count('Feel free to send us an email at spam@devnull.com or " + "to morespam@doa.com!', " + "'[[:alnum:]]+@[[:alnum:]]+\\.[[:alnum:]]+', 1, 'i');", + dt); + std::vector> expected_result_set{{int64_t(2)}}; + compare_result_set(expected_result_set, result_set); + } +} + TEST_F(StringFunctionTest, JsonValue) { for (auto dt : 
{ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); @@ -1635,6 +1713,101 @@ TEST_F(StringFunctionTest, Base64) { } } +class UrlEncodeDecode : public StringFunctionTest { + public: + struct Test { + std::string_view decoded; + std::string_view encoded; + std::string_view test_name; + }; + using Param = std::tuple; + // NOTE: test names must be non-empty, unique, and may only contain ASCII alphanumeric + // characters. [Use underscores carefully.] + static std::string testName(testing::TestParamInfo const& info) { + std::ostringstream oss; + oss << std::get<0>(info.param) << '_' << std::get<1>(info.param).test_name; + return oss.str(); + } +}; + +class UrlEncode : public UrlEncodeDecode, + public testing::WithParamInterface { + public: + void test_encode(ExecutorDeviceType const dt, Test const test) { + std::ostringstream query; + query << "SELECT '" << test.encoded << "' = URL_ENCODE('" << test.decoded << "');"; + EXPECT_TRUE(v(run_simple_agg(query.str(), dt))) << query.str(); + } +}; + +TEST_P(UrlEncode, Test) { + auto const [dt, test] = GetParam(); + if (!skip_tests(dt)) { + test_encode(dt, test); + } +} + +INSTANTIATE_TEST_SUITE_P( + StringFunctionTest, + UrlEncode, + testing::Combine(testing::Values(ExecutorDeviceType::CPU, ExecutorDeviceType::GPU), + testing::Values(UrlEncode::Test{"Hello World!", + "Hello+World%21", + "hello_world"})), + UrlEncode::testName); + +class UrlDecode : public UrlEncodeDecode, + public testing::WithParamInterface { + public: + void test_decode(ExecutorDeviceType const dt, Test const test) { + std::ostringstream query; + query << "SELECT '" << test.decoded << "' = URL_DECODE('" << test.encoded << "');"; + EXPECT_TRUE(v(run_simple_agg(query.str(), dt))) << query.str(); + } +}; + +TEST_P(UrlDecode, Test) { + auto const [dt, test] = GetParam(); + if (!skip_tests(dt)) { + test_decode(dt, test); + } +} + +// If % is one of the last two characters, it should not be decoded by URL_DECODE(). 
+INSTANTIATE_TEST_SUITE_P( + StringFunctionTest, + UrlDecode, + testing::Combine( + testing::Values(ExecutorDeviceType::CPU, ExecutorDeviceType::GPU), + testing::Values(UrlDecode::Test{"100%", "%3100%", "100_percent"}, + UrlDecode::Test{"100%!", "%3100%!", "100_percent_exclaim"}, + UrlDecode::Test{"100A", "%3100%41", "100A"})), + UrlDecode::testName); + +TEST_F(StringFunctionTest, UrlEncodeAndDecodeInversesAndNull) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + // Verify URL_DECODE() is inverse of URL_ENCODE() + char const* query = + "SELECT COUNT(*) = COUNT_IF(personal_motto = " + "URL_DECODE(URL_ENCODE(personal_motto))) FROM string_function_test_people;"; + EXPECT_TRUE(v(run_simple_agg(query, dt))) << query; + // Verify empty string and NULL behavior + query = + "SELECT URL_ENCODE(b_str) IS NULL FROM numeric_to_string_test ORDER BY b_str " + "NULLS FIRST LIMIT 1;"; + EXPECT_TRUE(v(run_simple_agg(query, dt))) << query; + query = + "SELECT URL_DECODE(b_str) IS NULL FROM numeric_to_string_test ORDER BY b_str " + "NULLS FIRST LIMIT 1;"; + EXPECT_TRUE(v(run_simple_agg(query, dt))) << query; + query = "SELECT URL_ENCODE('') IS NULL;"; + EXPECT_TRUE(v(run_simple_agg(query, dt))) << query; + query = "SELECT URL_DECODE('') IS NULL;"; + EXPECT_TRUE(v(run_simple_agg(query, dt))) << query; + } +} + TEST_F(StringFunctionTest, TryCastIntegerTypes) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); @@ -2212,6 +2385,132 @@ TEST_F(StringFunctionTest, LevenshteinDistance) { } } +TEST_F(StringFunctionTest, Hash) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + { + // Literal hash + auto result_set = sql("select hash('hi');", dt); + std::vector> expected_result_set{{int64_t(1097802)}}; + compare_result_set(expected_result_set, result_set); + } + { + // Literal null + auto result_set = sql("select coalesce(hash(CAST(NULL AS TEXT)), 0);", dt); + std::vector> 
expected_result_set{{int64_t(0)}}; + compare_result_set(expected_result_set, result_set); + } + { + // Dictionary-encoded text column + auto result_set = sql( + "select hash(capital) from string_function_test_countries order by id;", dt); + std::vector> expected_result_set{ + {int64_t(5703505280371710991)}, + {int64_t(1060071279222666409)}, + {int64_t(1057111063818803959)}, + {int64_t(1047250289947889561)}}; + compare_result_set(expected_result_set, result_set); + } + { + // None-encoded text column + auto result_set = sql( + "select hash(capital_none) from string_function_test_countries order by id;", + dt); + std::vector> expected_result_set{ + {int64_t(5703505280371710991)}, + {int64_t(1060071279222666409)}, + {int64_t(1057111063818803959)}, + {int64_t(1047250289947889561)}}; + compare_result_set(expected_result_set, result_set); + } + { + // Dictionary-encoded text column with nulls + auto result_set = + sql("select coalesce(hash(zip_plus_4), 0) from string_function_test_people " + "order by id;", + dt); + std::vector> expected_result_set{ + {int64_t(6345224789068548647)}, + {int64_t(-3868673234647279706)}, + {int64_t(0)}, + {int64_t(0)}}; + compare_result_set(expected_result_set, result_set); + } + { + // None-encoded text column with nulls + auto result_set = + sql("select coalesce(hash(short_name), 0) from string_function_test_countries " + "order by id;", + dt); + std::vector> expected_result_set{ + {int64_t(0)}, + {int64_t(1048231423487679005)}, + {int64_t(1078829)}, + {int64_t(-2445200816347761128)}}; + compare_result_set(expected_result_set, result_set); + } + { + // Hash comparison + auto result_set = + sql("select count(*) from string_function_test_countries where " + "hash(capital) = hash(capital_none);", + dt); + std::vector> expected_result_set{{int64_t(4)}}; + compare_result_set(expected_result_set, result_set); + } + { + auto result_set = + sql("select hash(lower(first_name)), any_value(lower(first_name)), count(*) " + "from 
string_function_test_people group by hash(lower(first_name)) order " + "by count(*) desc;", + dt); + std::vector> expected_result_set{ + {int64_t(1093213190016), "john", int64_t(3)}, + {int64_t(1105454758), "sue", int64_t(1)}}; + compare_result_set(expected_result_set, result_set); + } + } +} + +TEST_F(StringFunctionTest, NullLiteralTest) { + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + { + auto result_set = + sql("SELECT COUNT(str_fn) FROM (SELECT short_name, REGEXP_COUNT(CAST(NULL AS " + "TEXT),'u',0,'i') AS str_fn FROM string_function_test_countries);", + dt); + std::vector> expected_result_set{{int64_t(0)}}; + compare_result_set(expected_result_set, result_set); + } + { + auto result_set = + sql("SELECT COUNT(str_fn) FROM (SELECT short_name, REGEXP_SUBSTR(CAST(NULL AS " + "TEXT),'u', 1, -1,'i', 0) AS str_fn FROM string_function_test_countries);", + dt); + std::vector> expected_result_set{{int64_t(0)}}; + compare_result_set(expected_result_set, result_set); + } + { + auto result_set = + sql("SELECT COUNT(str_fn) FROM (SELECT short_name, POSITION('hi' in CAST(NULL " + "AS TEXT)) AS str_fn FROM string_function_test_countries);", + dt); + std::vector> expected_result_set{{int64_t(0)}}; + compare_result_set(expected_result_set, result_set); + } + { + auto result_set = + sql("SELECT COUNT(str_fn) FROM (SELECT short_name, " + "JAROWINKLER_SIMILARITY(CAST(NULL AS TEXT), CAST(NULL AS TEXT)) AS str_fn " + "FROM string_function_test_countries);", + dt); + std::vector> expected_result_set{{int64_t(0)}}; + compare_result_set(expected_result_set, result_set); + } + } +} + TEST_F(StringFunctionTest, ExplicitCastToNumeric) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); diff --git a/Tests/TestHelpers.h b/Tests/TestHelpers.h index a79955e1a2..54509d6acc 100644 --- a/Tests/TestHelpers.h +++ b/Tests/TestHelpers.h @@ -335,6 +335,9 @@ std::vector to_leaf_host_info(std::vector& server_inf void 
init_logger_stderr_only(int argc, char const* const* argv) { logger::LogOptions log_options(argv[0]); log_options.max_files_ = 0; // stderr only by default +#ifdef BASE_PATH + log_options.set_base_path(BASE_PATH); +#endif log_options.parse_command_line(argc, argv); logger::init(log_options); } @@ -342,6 +345,9 @@ void init_logger_stderr_only() { logger::LogOptions log_options(nullptr); log_options.max_files_ = 0; // stderr only by default +#ifdef BASE_PATH + log_options.set_base_path(BASE_PATH); +#endif logger::init(log_options); } diff --git a/Tests/UpdateMetadataTest.cpp b/Tests/UpdateMetadataTest.cpp index 137911423e..953c26c279 100644 --- a/Tests/UpdateMetadataTest.cpp +++ b/Tests/UpdateMetadataTest.cpp @@ -1818,6 +1818,7 @@ int main(int argc, char** argv) { logger::LogOptions log_options(argv[0]); log_options.max_files_ = 0; // stderr only by default + log_options.set_base_path(BASE_PATH); desc.add(log_options.get_options()); po::variables_map vm; diff --git a/ThirdParty/googlebenchmark/.gitignore b/ThirdParty/googlebenchmark/.gitignore index 806d04c6b3..704f56c257 100644 --- a/ThirdParty/googlebenchmark/.gitignore +++ b/ThirdParty/googlebenchmark/.gitignore @@ -8,8 +8,10 @@ !/cmake/*.cmake !/test/AssemblyTests.cmake *~ +*.swp *.pyc __pycache__ +.DS_Store # lcov *.lcov @@ -59,3 +61,7 @@ CMakeSettings.json # Visual Studio Code cache/options directory .vscode/ + +# Python build stuff +dist/ +*.egg-info* diff --git a/ThirdParty/googlebenchmark/.travis-libcxx-setup.sh b/ThirdParty/googlebenchmark/.travis-libcxx-setup.sh deleted file mode 100644 index a591743c6a..0000000000 --- a/ThirdParty/googlebenchmark/.travis-libcxx-setup.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# Install a newer CMake version -curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh -chmod +x install-cmake.sh -sudo 
./install-cmake.sh --prefix=/usr/local --skip-license - -# Checkout LLVM sources -git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source -git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx -git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi - -# Setup libc++ options -if [ -z "$BUILD_32_BITS" ]; then - export BUILD_32_BITS=OFF && echo disabling 32 bit build -fi - -# Build and install libc++ (Use unstable ABI for better sanitizer coverage) -mkdir llvm-build && cd llvm-build -cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \ - -DLIBCXX_ABI_UNSTABLE=ON \ - -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \ - -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \ - ../llvm-source -make cxx -j2 -sudo make install-cxxabi install-cxx -cd ../ diff --git a/ThirdParty/googlebenchmark/.travis.yml b/ThirdParty/googlebenchmark/.travis.yml index 6b6cfc7046..8cfed3d10d 100644 --- a/ThirdParty/googlebenchmark/.travis.yml +++ b/ThirdParty/googlebenchmark/.travis.yml @@ -2,10 +2,6 @@ sudo: required dist: trusty language: cpp -env: - global: - - /usr/local/bin:$PATH - matrix: include: - compiler: gcc @@ -14,10 +10,6 @@ matrix: packages: - lcov env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage - - compiler: gcc - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug - - compiler: gcc - env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release - compiler: gcc addons: apt: @@ -48,10 +40,6 @@ matrix: - COMPILER=g++-6 C_COMPILER=gcc-6 BUILD_TYPE=Debug - ENABLE_SANITIZER=1 - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold" - - compiler: clang - env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug - - compiler: clang - env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release # Clang w/ libc++ - compiler: clang dist: xenial @@ -150,16 +138,6 @@ matrix: - ENABLE_SANITIZER=1 - EXTRA_FLAGS="-g 
-O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" - EXTRA_CXX_FLAGS="-stdlib=libc++" - - os: osx - osx_image: xcode8.3 - compiler: clang - env: - - COMPILER=clang++ BUILD_TYPE=Debug - - os: osx - osx_image: xcode8.3 - compiler: clang - env: - - COMPILER=clang++ BUILD_TYPE=Release - os: osx osx_image: xcode8.3 compiler: clang @@ -168,15 +146,10 @@ matrix: - BUILD_TYPE=Release - BUILD_32_BITS=ON - EXTRA_FLAGS="-m32" - - os: osx - osx_image: xcode8.3 - compiler: gcc - env: - - COMPILER=g++-7 C_COMPILER=gcc-7 BUILD_TYPE=Debug before_script: - if [ -n "${LIBCXX_BUILD}" ]; then - source .travis-libcxx-setup.sh; + source .libcxx-setup.sh; fi - if [ -n "${ENABLE_SANITIZER}" ]; then export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF"; @@ -215,11 +188,11 @@ install: - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then sudo apt-get update -qq; sudo apt-get install -qq unzip cmake3; - wget https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-linux-x86_64.sh --output-document bazel-installer.sh; + wget https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-linux-x86_64.sh --output-document bazel-installer.sh; travis_wait sudo bash bazel-installer.sh; fi - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-darwin-x86_64.sh; + curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/3.2.0/bazel-3.2.0-installer-darwin-x86_64.sh; travis_wait sudo bash bazel-installer.sh; fi diff --git a/ThirdParty/googlebenchmark/AUTHORS b/ThirdParty/googlebenchmark/AUTHORS index 912cbbc13c..d08c1fdb87 100644 --- a/ThirdParty/googlebenchmark/AUTHORS +++ b/ThirdParty/googlebenchmark/AUTHORS @@ -13,39 +13,59 @@ Alex Steele Andriy Berestovskyy Arne Beer Carto +Cezary Skrzyński +Christian Wassermann Christopher Seymour +Colin Braley Daniel Harvey David Coeurjolly Deniz Evrenci Dirac 
Research Dominik Czarnota +Dominik Korman +Donald Aingworth +Eric Backus Eric Fiselier Eugene Zhuk Evgeny Safronov +Fabien Pichot Federico Ficarelli Felix Homann +Gergő Szitár Google Inc. +Henrique Bucher International Business Machines Corporation Ismael Jimenez Martinez Jern-Kuan Leong JianXiong Zhou Joao Paulo Magalhaes +Jordan Williams Jussi Knuuttila Kaito Udagawa Kishan Kumar Lei Xu +Marcel Jacobse Matt Clarkson Maxim Vafin +Mike Apodaca +Min-Yih Hsu MongoDB Inc. Nick Hutchinson +Norman Heino Oleksandr Sochka Ori Livneh Paul Redmond Radoslav Yovchev +Raghu Raja +Rainer Orth Roman Lebedev +Sayan Bhattacharjee +Shapr3D Shuo Chen +Staffan Tjernstrom Steinar H. Gunderson Stripe, Inc. +Tobias Schmidt Yixuan Qiu Yusuke Suzuki Zbigniew Skowron diff --git a/ThirdParty/googlebenchmark/BUILD.bazel b/ThirdParty/googlebenchmark/BUILD.bazel index 6ee69f2907..60d31d2f2e 100644 --- a/ThirdParty/googlebenchmark/BUILD.bazel +++ b/ThirdParty/googlebenchmark/BUILD.bazel @@ -1,13 +1,37 @@ licenses(["notice"]) +config_setting( + name = "qnx", + constraint_values = ["@platforms//os:qnx"], + values = { + "cpu": "x64_qnx", + }, + visibility = [":__subpackages__"], +) + config_setting( name = "windows", + constraint_values = ["@platforms//os:windows"], values = { "cpu": "x64_windows", }, visibility = [":__subpackages__"], ) +config_setting( + name = "macos", + constraint_values = ["@platforms//os:macos"], + visibility = ["//visibility:public"], +) + +config_setting( + name = "perfcounters", + define_values = { + "pfm": "1", + }, + visibility = [":__subpackages__"], +) + cc_library( name = "benchmark", srcs = glob( @@ -17,19 +41,40 @@ cc_library( ], exclude = ["src/benchmark_main.cc"], ), - hdrs = ["include/benchmark/benchmark.h"], + hdrs = [ + "include/benchmark/benchmark.h", + "include/benchmark/export.h", + ], linkopts = select({ ":windows": ["-DEFAULTLIB:shlwapi.lib"], "//conditions:default": ["-pthread"], }), + copts = select({ + ":windows": [], + "//conditions:default": 
["-Werror=old-style-cast"], + }), strip_include_prefix = "include", visibility = ["//visibility:public"], + # Only static linking is allowed; no .so will be produced. + # Using `defines` (i.e. not `local_defines`) means that no + # dependent rules need to bother about defining the macro. + linkstatic = True, + defines = [ + "BENCHMARK_STATIC_DEFINE", + ] + select({ + ":perfcounters": ["HAVE_LIBPFM"], + "//conditions:default": [], + }), + deps = select({ + ":perfcounters": ["@libpfm//:libpfm"], + "//conditions:default": [], + }), ) cc_library( name = "benchmark_main", srcs = ["src/benchmark_main.cc"], - hdrs = ["include/benchmark/benchmark.h"], + hdrs = ["include/benchmark/benchmark.h", "include/benchmark/export.h"], strip_include_prefix = "include", visibility = ["//visibility:public"], deps = [":benchmark"], diff --git a/ThirdParty/googlebenchmark/CMakeLists.txt b/ThirdParty/googlebenchmark/CMakeLists.txt index 77739d491b..ffd7deeb2f 100644 --- a/ThirdParty/googlebenchmark/CMakeLists.txt +++ b/ThirdParty/googlebenchmark/CMakeLists.txt @@ -1,29 +1,34 @@ -cmake_minimum_required (VERSION 3.5.1) - -foreach(p - CMP0048 # OK to clear PROJECT_VERSION on project() - CMP0054 # CMake 3.1 - CMP0056 # export EXE_LINKER_FLAGS to try_run - CMP0057 # Support no if() IN_LIST operator - CMP0063 # Honor visibility properties for all targets - ) - if(POLICY ${p}) - cmake_policy(SET ${p} NEW) - endif() -endforeach() +# Require CMake 3.10. If available, use the policies up to CMake 3.22. +cmake_minimum_required (VERSION 3.10...3.22) -project (benchmark CXX) +project (benchmark VERSION 1.8.3 LANGUAGES CXX) option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON) option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON) option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF) option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." 
OFF) +option(BENCHMARK_ENABLE_WERROR "Build Release candidates with -Werror." ON) +option(BENCHMARK_FORCE_WERROR "Build Release candidates with -Werror regardless of compiler issues." OFF) + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "PGI") + # PGC++ maybe reporting false positives. + set(BENCHMARK_ENABLE_WERROR OFF) +endif() +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "NVHPC") + set(BENCHMARK_ENABLE_WERROR OFF) +endif() +if(BENCHMARK_FORCE_WERROR) + set(BENCHMARK_ENABLE_WERROR ON) +endif(BENCHMARK_FORCE_WERROR) + if(NOT MSVC) option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF) else() set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE) endif() -option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" OFF) +option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON) +option(BENCHMARK_ENABLE_DOXYGEN "Build documentation with Doxygen." OFF) +option(BENCHMARK_INSTALL_DOCS "Enable installation of documentation." ON) # Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which # may require downloading the source code. @@ -32,6 +37,24 @@ option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree buildi # This option can be used to disable building and running unit tests which depend on gtest # in cases where it is not possible to build or find a valid version of gtest. option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON) +option(BENCHMARK_USE_BUNDLED_GTEST "Use bundled GoogleTest. If disabled, the find_package(GTest) will be used." 
ON) + +option(BENCHMARK_ENABLE_LIBPFM "Enable performance counters provided by libpfm" OFF) + +# Export only public symbols +set(CMAKE_CXX_VISIBILITY_PRESET hidden) +set(CMAKE_VISIBILITY_INLINES_HIDDEN ON) + +if(MSVC) + # As of CMake 3.18, CMAKE_SYSTEM_PROCESSOR is not set properly for MSVC and + # cross-compilation (e.g. Host=x86_64, target=aarch64) requires using the + # undocumented, but working variable. + # See https://gitlab.kitware.com/cmake/cmake/-/issues/15170 + set(CMAKE_SYSTEM_PROCESSOR ${MSVC_CXX_ARCHITECTURE_ID}) + if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ARM") + set(CMAKE_CROSSCOMPILING TRUE) + endif() +endif() set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF) function(should_enable_assembly_tests) @@ -57,9 +80,9 @@ function(should_enable_assembly_tests) find_program(LLVM_FILECHECK_EXE FileCheck) if (LLVM_FILECHECK_EXE) set(LLVM_FILECHECK_EXE "${LLVM_FILECHECK_EXE}" CACHE PATH "llvm filecheck" FORCE) - # message(STATUS "LLVM FileCheck Found: ${LLVM_FILECHECK_EXE}") + message(STATUS "LLVM FileCheck Found: ${LLVM_FILECHECK_EXE}") else() - # message(STATUS "Failed to find LLVM FileCheck") + message(STATUS "Failed to find LLVM FileCheck") return() endif() set(ENABLE_ASSEMBLY_TESTS_DEFAULT ON PARENT_SCOPE) @@ -79,23 +102,42 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") include(GetGitVersion) get_git_version(GIT_VERSION) +# If no git version can be determined, use the version +# from the project() command +if ("${GIT_VERSION}" STREQUAL "0.0.0") + set(VERSION "${benchmark_VERSION}") +else() + set(VERSION "${GIT_VERSION}") +endif() # Tell the user what versions we are using -string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION}) -# message(STATUS "Version: ${VERSION}") +message(STATUS "Google Benchmark version: ${VERSION}") # The version of the libraries set(GENERIC_LIB_VERSION ${VERSION}) string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION) # Import our CMake modules -include(CheckCXXCompilerFlag) include(AddCXXCompilerFlag) 
+include(CheckCXXCompilerFlag) +include(CheckLibraryExists) include(CXXFeatureCheck) +check_library_exists(rt shm_open "" HAVE_LIB_RT) + if (BENCHMARK_BUILD_32_BITS) add_required_cxx_compiler_flag(-m32) endif() +if (MSVC) + set(BENCHMARK_CXX_STANDARD 14) +else() + set(BENCHMARK_CXX_STANDARD 11) +endif() + +set(CMAKE_CXX_STANDARD ${BENCHMARK_CXX_STANDARD}) +set(CMAKE_CXX_STANDARD_REQUIRED YES) +set(CMAKE_CXX_EXTENSIONS OFF) + if (MSVC) # Turn compiler warnings up to 11 string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") @@ -128,20 +170,19 @@ if (MSVC) set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG") endif() else() - # Try and enable C++11. Don't use C++14 because it doesn't work in some - # configurations. - add_cxx_compiler_flag(-std=c++11) - if (NOT HAVE_CXX_FLAG_STD_CXX11) - add_cxx_compiler_flag(-std=c++0x) - endif() - # Turn compiler warnings up to 11 add_cxx_compiler_flag(-Wall) add_cxx_compiler_flag(-Wextra) add_cxx_compiler_flag(-Wshadow) - add_cxx_compiler_flag(-Werror RELEASE) - add_cxx_compiler_flag(-Werror RELWITHDEBINFO) - add_cxx_compiler_flag(-Werror MINSIZEREL) + add_cxx_compiler_flag(-Wfloat-equal) + add_cxx_compiler_flag(-Wold-style-cast) + if(BENCHMARK_ENABLE_WERROR) + add_cxx_compiler_flag(-Werror) + endif() + if (NOT BENCHMARK_ENABLE_TESTING) + # Disable warning when compiling tests as gtest does not use 'override'. + add_cxx_compiler_flag(-Wsuggest-override) + endif() add_cxx_compiler_flag(-pedantic) add_cxx_compiler_flag(-pedantic-errors) add_cxx_compiler_flag(-Wshorten-64-to-32) @@ -149,22 +190,23 @@ else() # Disable warnings regarding deprecated parts of the library while building # and testing those parts of the library. add_cxx_compiler_flag(-Wno-deprecated-declarations) - if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") # Intel silently ignores '-Wno-deprecated-declarations', # warning no. 
1786 must be explicitly disabled. # See #631 for rationale. add_cxx_compiler_flag(-wd1786) + add_cxx_compiler_flag(-fno-finite-math-only) endif() # Disable deprecation warnings for release builds (when -Werror is enabled). - add_cxx_compiler_flag(-Wno-deprecated RELEASE) - add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO) - add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL) + if(BENCHMARK_ENABLE_WERROR) + add_cxx_compiler_flag(-Wno-deprecated) + endif() if (NOT BENCHMARK_ENABLE_EXCEPTIONS) add_cxx_compiler_flag(-fno-exceptions) endif() if (HAVE_CXX_FLAG_FSTRICT_ALIASING) - if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing + if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") #ICC17u2: Many false positives for Wstrict-aliasing add_cxx_compiler_flag(-Wstrict-aliasing) endif() endif() @@ -173,12 +215,12 @@ else() add_cxx_compiler_flag(-wd654) add_cxx_compiler_flag(-Wthread-safety) if (HAVE_CXX_FLAG_WTHREAD_SAFETY) - cxx_feature_check(THREAD_SAFETY_ATTRIBUTES) + cxx_feature_check(THREAD_SAFETY_ATTRIBUTES "-DINCLUDE_DIRECTORIES=${PROJECT_SOURCE_DIR}/include") endif() # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a # predefined macro, which turns on all of the wonderful libc extensions. - # However g++ doesn't do this in Cygwin so we have to define it ourselfs + # However g++ doesn't do this in Cygwin so we have to define it ourselves # since we depend on GNU/POSIX/BSD extensions. 
if (CYGWIN) add_definitions(-D_GNU_SOURCE=1) @@ -191,6 +233,7 @@ else() # Link time optimisation if (BENCHMARK_ENABLE_LTO) add_cxx_compiler_flag(-flto) + add_cxx_compiler_flag(-Wno-lto-type-mismatch) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") find_program(GCC_AR gcc-ar) if (GCC_AR) @@ -228,7 +271,8 @@ if (BENCHMARK_USE_LIBCXX) if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") add_cxx_compiler_flag(-stdlib=libc++) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR - "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") + "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel" OR + "${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM") add_cxx_compiler_flag(-nostdinc++) message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS") # Adding -nodefaultlibs directly to CMAKE__LINKER_FLAGS will break @@ -242,11 +286,17 @@ if (BENCHMARK_USE_LIBCXX) endif() endif(BENCHMARK_USE_LIBCXX) +set(EXTRA_CXX_FLAGS "") +if (WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + # Clang on Windows fails to compile the regex feature check under C++11 + set(EXTRA_CXX_FLAGS "-DCMAKE_CXX_STANDARD=14") +endif() + # C++ feature checks # Determine the correct regular expression engine to use -cxx_feature_check(STD_REGEX) -cxx_feature_check(GNU_POSIX_REGEX) -cxx_feature_check(POSIX_REGEX) +cxx_feature_check(STD_REGEX ${EXTRA_CXX_FLAGS}) +cxx_feature_check(GNU_POSIX_REGEX ${EXTRA_CXX_FLAGS}) +cxx_feature_check(POSIX_REGEX ${EXTRA_CXX_FLAGS}) if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX) message(FATAL_ERROR "Failed to determine the source files for the regular expression backend") endif() @@ -254,10 +304,16 @@ if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX) message(WARNING "Using std::regex with exceptions disabled is not fully supported") endif() + cxx_feature_check(STEADY_CLOCK) # Ensure we have pthreads set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) 
+cxx_feature_check(PTHREAD_AFFINITY) + +if (BENCHMARK_ENABLE_LIBPFM) + find_package(PFM) +endif() # Set up directories include_directories(${PROJECT_SOURCE_DIR}/include) @@ -270,7 +326,15 @@ if (BENCHMARK_ENABLE_TESTING) if (BENCHMARK_ENABLE_GTEST_TESTS AND NOT (TARGET gtest AND TARGET gtest_main AND TARGET gmock AND TARGET gmock_main)) - include(GoogleTest) + if (BENCHMARK_USE_BUNDLED_GTEST) + include(GoogleTest) + else() + find_package(GTest CONFIG REQUIRED) + add_library(gtest ALIAS GTest::gtest) + add_library(gtest_main ALIAS GTest::gtest_main) + add_library(gmock ALIAS GTest::gmock) + add_library(gmock_main ALIAS GTest::gmock_main) + endif() endif() add_subdirectory(test) endif() diff --git a/ThirdParty/googlebenchmark/CONTRIBUTORS b/ThirdParty/googlebenchmark/CONTRIBUTORS index b680efc8c4..95bcad019b 100644 --- a/ThirdParty/googlebenchmark/CONTRIBUTORS +++ b/ThirdParty/googlebenchmark/CONTRIBUTORS @@ -22,49 +22,72 @@ # # Please keep the list sorted. +Abhina Sreeskantharajan Albert Pretorius Alex Steele Andriy Berestovskyy Arne Beer +Bátor Tallér Billy Robert O'Neal III +Cezary Skrzyński Chris Kennelly +Christian Wassermann Christopher Seymour +Colin Braley Cyrille Faucheux Daniel Harvey David Coeurjolly Deniz Evrenci Dominic Hamon Dominik Czarnota +Dominik Korman +Donald Aingworth +Eric Backus Eric Fiselier Eugene Zhuk Evgeny Safronov +Fabien Pichot +Fanbo Meng Federico Ficarelli Felix Homann +Geoffrey Martin-Noble +Gergő Szitár Hannes Hauswedell +Henrique Bucher Ismael Jimenez Martinez Jern-Kuan Leong JianXiong Zhou Joao Paulo Magalhaes John Millikin +Jordan Williams Jussi Knuuttila +Kaito Udagawa Kai Wolf Kishan Kumar -Kaito Udagawa Lei Xu +Marcel Jacobse Matt Clarkson Maxim Vafin +Mike Apodaca +Min-Yih Hsu Nick Hutchinson +Norman Heino Oleksandr Sochka Ori Livneh Pascal Leroy Paul Redmond Pierre Phaneuf Radoslav Yovchev +Raghu Raja +Rainer Orth Raul Marin Ray Glover Robert Guo Roman Lebedev +Sayan Bhattacharjee Shuo Chen +Steven Wan +Tobias Schmidt Tobias 
Ulvgård Tom Madams Yixuan Qiu diff --git a/ThirdParty/googlebenchmark/README.md b/ThirdParty/googlebenchmark/README.md index 45e4158843..a5e5d392d8 100644 --- a/ThirdParty/googlebenchmark/README.md +++ b/ThirdParty/googlebenchmark/README.md @@ -1,9 +1,12 @@ # Benchmark -[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark) -[![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master) + +[![build-and-test](https://github.com/google/benchmark/workflows/build-and-test/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Abuild-and-test) +[![bazel](https://github.com/google/benchmark/actions/workflows/bazel.yml/badge.svg)](https://github.com/google/benchmark/actions/workflows/bazel.yml) +[![pylint](https://github.com/google/benchmark/workflows/pylint/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Apylint) +[![test-bindings](https://github.com/google/benchmark/workflows/test-bindings/badge.svg)](https://github.com/google/benchmark/actions?query=workflow%3Atest-bindings) [![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark) -[![slackin](https://slackin-iqtfqnpzxd.now.sh/badge.svg)](https://slackin-iqtfqnpzxd.now.sh/) +[![Discord](https://discordapp.com/api/guilds/1125694995928719494/widget.png?style=shield)](https://discord.gg/cz7UX7wKC2) A library to benchmark code snippets, similar to unit tests. Example: @@ -23,23 +26,28 @@ BENCHMARK(BM_SomeFunction); BENCHMARK_MAIN(); ``` +## Getting Started + To get started, see [Requirements](#requirements) and [Installation](#installation). See [Usage](#usage) for a full example and the -[User Guide](#user-guide) for a more comprehensive feature overview. +[User Guide](docs/user_guide.md) for a more comprehensive feature overview. 
-It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/googletest/docs/primer.md) +It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/main/docs/primer.md) as some of the structural aspects of the APIs are similar. -### Resources +## Resources [Discussion group](https://groups.google.com/d/forum/benchmark-discuss) -IRC channel: [freenode](https://freenode.net) #googlebenchmark +IRC channels: +* [libera](https://libera.chat) #benchmark [Additional Tooling Documentation](docs/tools.md) [Assembly Testing Documentation](docs/AssemblyTests.md) +[Building and installing Python bindings](docs/python_bindings.md) + ## Requirements The library can be used with C++03. However, it requires C++11 to build, @@ -49,62 +57,65 @@ The following minimum versions are required to build the library: * GCC 4.8 * Clang 3.4 -* Visual Studio 2013 +* Visual Studio 14 2015 * Intel 2015 Update 1 +See [Platform-Specific Build Instructions](docs/platform_specific_build_instructions.md). + ## Installation This describes the installation process using cmake. As pre-requisites, you'll need git and cmake installed. -_See [dependencies.md](dependencies.md) for more details regarding supported +_See [dependencies.md](docs/dependencies.md) for more details regarding supported versions of build tools._ ```bash # Check out the library. $ git clone https://github.com/google/benchmark.git -# Benchmark requires Google Test as a dependency. Add the source tree as a subdirectory. -$ git clone https://github.com/google/googletest.git benchmark/googletest +# Go to the library root directory +$ cd benchmark # Make a build directory to place the build output. -$ mkdir build && cd build -# Generate a Makefile with cmake. -# Use cmake -G to generate a different file type. -$ cmake ../benchmark +$ cmake -E make_directory "build" +# Generate build system files with cmake, and download any dependencies. 
+$ cmake -E chdir "build" cmake -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DCMAKE_BUILD_TYPE=Release ../ +# or, starting with CMake 3.13, use a simpler form: +# cmake -DCMAKE_BUILD_TYPE=Release -S . -B "build" # Build the library. -$ make +$ cmake --build "build" --config Release ``` This builds the `benchmark` and `benchmark_main` libraries and tests. On a unix system, the build directory should now look something like this: ``` /benchmark -/build - /src - /libbenchmark.a - /libbenchmark_main.a - /test - ... + /build + /src + /libbenchmark.a + /libbenchmark_main.a + /test + ... ``` Next, you can run the tests to check the build. ```bash -$ make test +$ cmake -E chdir "build" ctest --build-config Release ``` If you want to install the library globally, also run: ``` -sudo make install +sudo cmake --build "build" --config Release --target install ``` Note that Google Benchmark requires Google Test to build and run the tests. This dependency can be provided two ways: -* Checkout the Google Test sources into `benchmark/googletest` as above. +* Checkout the Google Test sources into `benchmark/googletest`. * Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during - configuration, the library will automatically download and build any required - dependencies. + configuration as above, the library will automatically download and build + any required dependencies. If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF` to `CMAKE_ARGS`. @@ -112,17 +123,14 @@ to `CMAKE_ARGS`. ### Debug vs Release By default, benchmark builds as a debug library. You will see a warning in the -output when this is the case. To build it as a release library instead, use: +output when this is the case. To build it as a release library instead, add +`-DCMAKE_BUILD_TYPE=Release` when generating the build system files, as shown +above. 
The use of `--config Release` in build commands is needed to properly +support multi-configuration tools (like Visual Studio for example) and can be +skipped for other build systems (like Makefile). -``` -cmake -DCMAKE_BUILD_TYPE=Release -``` - -To enable link-time optimisation, use - -``` -cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true -``` +To enable link-time optimisation, also add `-DBENCHMARK_ENABLE_LTO=true` when +generating the build system files. If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake cache variables, if autodetection fails. @@ -130,6 +138,11 @@ cache variables, if autodetection fails. If you are using clang, you may need to set `LLVMAR_EXECUTABLE`, `LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables. +To enable sanitizer checks (eg., `asan` and `tsan`), add: +``` + -DCMAKE_C_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fsanitize=thread -fno-sanitize-recover=all" + -DCMAKE_CXX_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=address -fsanitize=thread -fno-sanitize-recover=all " +``` ### Stable and Experimental Library Versions @@ -144,7 +157,9 @@ this branch. However, this branch provides no stability guarantees and reserves the right to change and break the API at any time. ## Usage + ### Basic usage + Define a function that executes the code to measure, register it as a benchmark function using the `BENCHMARK` macro, and ensure an appropriate `main` function is available: @@ -171,1009 +186,38 @@ BENCHMARK_MAIN(); ``` To run the benchmark, compile and link against the `benchmark` library -(libbenchmark.a/.so). If you followed the build steps above, this -library will be under the build directory you created. +(libbenchmark.a/.so). If you followed the build steps above, this library will +be under the build directory you created. ```bash # Example on linux after running the build steps above. Assumes the # `benchmark` and `build` directories are under the current directory. 
-$ g++ -std=c++11 -isystem benchmark/include -Lbuild/src -lpthread \ - -lbenchmark mybenchmark.cc -o mybenchmark +$ g++ mybenchmark.cc -std=c++11 -isystem benchmark/include \ + -Lbenchmark/build/src -lbenchmark -lpthread -o mybenchmark ``` Alternatively, link against the `benchmark_main` library and remove `BENCHMARK_MAIN();` above to get the same behavior. The compiled executable will run all benchmarks by default. Pass the `--help` -flag for option information or see the guide below. - -### Platform-specific instructions - -When the library is built using GCC it is necessary to link with the pthread -library due to how GCC implements `std::thread`. Failing to link to pthread will -lead to runtime exceptions (unless you're using libc++), not linker errors. See -[issue #67](https://github.com/google/benchmark/issues/67) for more details. You -can link to pthread by adding `-pthread` to your linker command. Note, you can -also use `-lpthread`, but there are potential issues with ordering of command -line parameters if you use that. - -If you're running benchmarks on Windows, the shlwapi library (`-lshlwapi`) is -also required. - -If you're running benchmarks on solaris, you'll want the kstat library linked in -too (`-lkstat`). 
- -## User Guide - -### Command Line -[Output Formats](#output-formats) - -[Output Files](#output-files) - -[Running a Subset of Benchmarks](#running-a-subset-of-benchmarks) - -[Result Comparison](#result-comparison) - -### Library -[Runtime and Reporting Considerations](#runtime-and-reporting-considerations) - -[Passing Arguments](#passing-arguments) - -[Calculating Asymptotic Complexity](#asymptotic-complexity) - -[Templated Benchmarks](#templated-benchmarks) - -[Fixtures](#fixtures) - -[Custom Counters](#custom-counters) - -[Multithreaded Benchmarks](#multithreaded-benchmarks) - -[CPU Timers](#cpu-timers) - -[Manual Timing](#manual-timing) - -[Setting the Time Unit](#setting-the-time-unit) - -[Preventing Optimization](#preventing-optimization) - -[Reporting Statistics](#reporting-statistics) - -[Custom Statistics](#custom-statistics) - -[Using RegisterBenchmark](#using-register-benchmark) - -[Exiting with an Error](#exiting-with-an-error) - -[A Faster KeepRunning Loop](#a-faster-keep-running-loop) - -[Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling) - - - -### Output Formats - -The library supports multiple output formats. Use the -`--benchmark_format=` flag to set the format type. `console` -is the default format. - -The Console format is intended to be a human readable format. By default -the format generates color output. Context is output on stderr and the -tabular data on stdout. Example tabular output looks like: -``` -Benchmark Time(ns) CPU(ns) Iterations ----------------------------------------------------------------------- -BM_SetInsert/1024/1 28928 29349 23853 133.097kB/s 33.2742k items/s -BM_SetInsert/1024/8 32065 32913 21375 949.487kB/s 237.372k items/s -BM_SetInsert/1024/10 33157 33648 21431 1.13369MB/s 290.225k items/s -``` - -The JSON format outputs human readable json split into two top level attributes. -The `context` attribute contains information about the run in general, including -information about the CPU and the date. 
-The `benchmarks` attribute contains a list of every benchmark run. Example json -output looks like: -```json -{ - "context": { - "date": "2015/03/17-18:40:25", - "num_cpus": 40, - "mhz_per_cpu": 2801, - "cpu_scaling_enabled": false, - "build_type": "debug" - }, - "benchmarks": [ - { - "name": "BM_SetInsert/1024/1", - "iterations": 94877, - "real_time": 29275, - "cpu_time": 29836, - "bytes_per_second": 134066, - "items_per_second": 33516 - }, - { - "name": "BM_SetInsert/1024/8", - "iterations": 21609, - "real_time": 32317, - "cpu_time": 32429, - "bytes_per_second": 986770, - "items_per_second": 246693 - }, - { - "name": "BM_SetInsert/1024/10", - "iterations": 21393, - "real_time": 32724, - "cpu_time": 33355, - "bytes_per_second": 1199226, - "items_per_second": 299807 - } - ] -} -``` - -The CSV format outputs comma-separated values. The `context` is output on stderr -and the CSV itself on stdout. Example CSV output looks like: -``` -name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label -"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942, -"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115, -"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06, -``` - - - -### Output Files - -Write benchmark results to a file with the `--benchmark_out=` option. -Specify the output format with `--benchmark_out_format={json|console|csv}`. Note that Specifying -`--benchmark_out` does not suppress the console output. - - - -### Running a Subset of Benchmarks - -The `--benchmark_filter=` option can be used to only run the benchmarks -which match the specified ``. 
For example: - -```bash -$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32 -Run on (1 X 2300 MHz CPU ) -2016-06-25 19:34:24 -Benchmark Time CPU Iterations ----------------------------------------------------- -BM_memcpy/32 11 ns 11 ns 79545455 -BM_memcpy/32k 2181 ns 2185 ns 324074 -BM_memcpy/32 12 ns 12 ns 54687500 -BM_memcpy/32k 1834 ns 1837 ns 357143 -``` - - - -### Result comparison - -It is possible to compare the benchmarking results. See [Additional Tooling Documentation](docs/tools.md) - - - -### Runtime and Reporting Considerations - -When the benchmark binary is executed, each benchmark function is run serially. -The number of iterations to run is determined dynamically by running the -benchmark a few times and measuring the time taken and ensuring that the -ultimate result will be statistically stable. As such, faster benchmark -functions will be run for more iterations than slower benchmark functions, and -the number of iterations is thus reported. - -In all cases, the number of iterations for which the benchmark is run is -governed by the amount of time the benchmark takes. Concretely, the number of -iterations is at least one, not more than 1e9, until CPU time is greater than -the minimum time, or the wallclock time is 5x minimum time. The minimum time is -set per benchmark by calling `MinTime` on the registered benchmark object. - -Average timings are then reported over the iterations run. If multiple -repetitions are requested using the `--benchmark_repetitions` command-line -option, or at registration time, the benchmark function will be run several -times and statistical results across these repetitions will also be reported. - -As well as the per-benchmark entries, a preamble in the report will include -information about the machine on which the benchmarks are run. 
- - - -### Passing Arguments - -Sometimes a family of benchmarks can be implemented with just one routine that -takes an extra argument to specify which one of the family of benchmarks to -run. For example, the following code defines a family of benchmarks for -measuring the speed of `memcpy()` calls of different lengths: - -```c++ -static void BM_memcpy(benchmark::State& state) { - char* src = new char[state.range(0)]; - char* dst = new char[state.range(0)]; - memset(src, 'x', state.range(0)); - for (auto _ : state) - memcpy(dst, src, state.range(0)); - state.SetBytesProcessed(int64_t(state.iterations()) * - int64_t(state.range(0))); - delete[] src; - delete[] dst; -} -BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10); -``` - -The preceding code is quite repetitive, and can be replaced with the following -short-hand. The following invocation will pick a few appropriate arguments in -the specified range and will generate a benchmark for each such argument. - -```c++ -BENCHMARK(BM_memcpy)->Range(8, 8<<10); -``` - -By default the arguments in the range are generated in multiples of eight and -the command above selects [ 8, 64, 512, 4k, 8k ]. In the following code the -range multiplier is changed to multiples of two. - -```c++ -BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10); -``` -Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ]. - -You might have a benchmark that depends on two or more inputs. For example, the -following code defines a family of benchmarks for measuring the speed of set -insertion. 
- -```c++ -static void BM_SetInsert(benchmark::State& state) { - std::set data; - for (auto _ : state) { - state.PauseTiming(); - data = ConstructRandomSet(state.range(0)); - state.ResumeTiming(); - for (int j = 0; j < state.range(1); ++j) - data.insert(RandomNumber()); - } -} -BENCHMARK(BM_SetInsert) - ->Args({1<<10, 128}) - ->Args({2<<10, 128}) - ->Args({4<<10, 128}) - ->Args({8<<10, 128}) - ->Args({1<<10, 512}) - ->Args({2<<10, 512}) - ->Args({4<<10, 512}) - ->Args({8<<10, 512}); -``` - -The preceding code is quite repetitive, and can be replaced with the following -short-hand. The following macro will pick a few appropriate arguments in the -product of the two specified ranges and will generate a benchmark for each such -pair. - -```c++ -BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}}); -``` - -For more complex patterns of inputs, passing a custom function to `Apply` allows -programmatic specification of an arbitrary set of arguments on which to run the -benchmark. The following example enumerates a dense range on one parameter, -and a sparse range on the second. - -```c++ -static void CustomArguments(benchmark::internal::Benchmark* b) { - for (int i = 0; i <= 10; ++i) - for (int j = 32; j <= 1024*1024; j *= 8) - b->Args({i, j}); -} -BENCHMARK(BM_SetInsert)->Apply(CustomArguments); -``` - -#### Passing Arbitrary Arguments to a Benchmark - -In C++11 it is possible to define a benchmark that takes an arbitrary number -of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)` -macro creates a benchmark that invokes `func` with the `benchmark::State` as -the first argument followed by the specified `args...`. -The `test_case_name` is appended to the name of the benchmark and -should describe the values passed. - -```c++ -template -void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) { - [...] -} -// Registers a benchmark named "BM_takes_args/int_string_test" that passes -// the specified values to `extra_args`. 
-BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc")); -``` -Note that elements of `...args` may refer to global variables. Users should -avoid modifying global state inside of a benchmark. - - - -### Calculating Asymptotic Complexity (Big O) - -Asymptotic complexity might be calculated for a family of benchmarks. The -following code will calculate the coefficient for the high-order term in the -running time and the normalized root-mean square error of string comparison. - -```c++ -static void BM_StringCompare(benchmark::State& state) { - std::string s1(state.range(0), '-'); - std::string s2(state.range(0), '-'); - for (auto _ : state) { - benchmark::DoNotOptimize(s1.compare(s2)); - } - state.SetComplexityN(state.range(0)); -} -BENCHMARK(BM_StringCompare) - ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN); -``` - -As shown in the following invocation, asymptotic complexity might also be -calculated automatically. - -```c++ -BENCHMARK(BM_StringCompare) - ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(); -``` - -The following code will specify asymptotic complexity with a lambda function, -that might be used to customize high-order term calculation. - -```c++ -BENCHMARK(BM_StringCompare)->RangeMultiplier(2) - ->Range(1<<10, 1<<18)->Complexity([](int64_t n)->double{return n; }); -``` - - - -### Templated Benchmarks - -This example produces and consumes messages of size `sizeof(v)` `range_x` -times. It also outputs throughput in the absence of multiprogramming. 
- -```c++ -template void BM_Sequential(benchmark::State& state) { - Q q; - typename Q::value_type v; - for (auto _ : state) { - for (int i = state.range(0); i--; ) - q.push(v); - for (int e = state.range(0); e--; ) - q.Wait(&v); - } - // actually messages, not bytes: - state.SetBytesProcessed( - static_cast(state.iterations())*state.range(0)); -} -BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue)->Range(1<<0, 1<<10); -``` - -Three macros are provided for adding benchmark templates. - -```c++ -#ifdef BENCHMARK_HAS_CXX11 -#define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters. -#else // C++ < C++11 -#define BENCHMARK_TEMPLATE(func, arg1) -#endif -#define BENCHMARK_TEMPLATE1(func, arg1) -#define BENCHMARK_TEMPLATE2(func, arg1, arg2) -``` - - - -### Fixtures - -Fixture tests are created by first defining a type that derives from -`::benchmark::Fixture` and then creating/registering the tests using the -following macros: - -* `BENCHMARK_F(ClassName, Method)` -* `BENCHMARK_DEFINE_F(ClassName, Method)` -* `BENCHMARK_REGISTER_F(ClassName, Method)` - -For Example: - -```c++ -class MyFixture : public benchmark::Fixture { -public: - void SetUp(const ::benchmark::State& state) { - } - - void TearDown(const ::benchmark::State& state) { - } -}; - -BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) { - for (auto _ : st) { - ... - } -} - -BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) { - for (auto _ : st) { - ... - } -} -/* BarTest is NOT registered */ -BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2); -/* BarTest is now registered */ -``` - -#### Templated Fixtures - -Also you can create templated fixture by using the following macros: - -* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)` -* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)` - -For example: -```c++ -template -class MyFixture : public benchmark::Fixture {}; - -BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) { - for (auto _ : st) { - ... 
- } -} - -BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) { - for (auto _ : st) { - ... - } -} - -BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2); -``` - - - -### Custom Counters - -You can add your own counters with user-defined names. The example below -will add columns "Foo", "Bar" and "Baz" in its output: - -```c++ -static void UserCountersExample1(benchmark::State& state) { - double numFoos = 0, numBars = 0, numBazs = 0; - for (auto _ : state) { - // ... count Foo,Bar,Baz events - } - state.counters["Foo"] = numFoos; - state.counters["Bar"] = numBars; - state.counters["Baz"] = numBazs; -} -``` - -The `state.counters` object is a `std::map` with `std::string` keys -and `Counter` values. The latter is a `double`-like class, via an implicit -conversion to `double&`. Thus you can use all of the standard arithmetic -assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter. - -In multithreaded benchmarks, each counter is set on the calling thread only. -When the benchmark finishes, the counters from each thread will be summed; -the resulting sum is the value which will be shown for the benchmark. - -The `Counter` constructor accepts three parameters: the value as a `double` -; a bit flag which allows you to show counters as rates, and/or as per-thread -iteration, and/or as per-thread averages, and/or iteration invariants; -and a flag specifying the 'unit' - i.e. is 1k a 1000 (default, -`benchmark::Counter::OneK::kIs1000`), or 1024 -(`benchmark::Counter::OneK::kIs1024`)? - -```c++ - // sets a simple counter - state.counters["Foo"] = numFoos; - - // Set the counter as a rate. It will be presented divided - // by the duration of the benchmark. - state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate); - - // Set the counter as a thread-average quantity. It will - // be presented divided by the number of threads. 
- state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads); - - // There's also a combined flag: - state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate); - - // This says that we process with the rate of state.range(0) bytes every iteration: - state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024); -``` - -When you're compiling in C++11 mode or later you can use `insert()` with -`std::initializer_list`: - -```c++ - // With C++11, this can be done: - state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}}); - // ... instead of: - state.counters["Foo"] = numFoos; - state.counters["Bar"] = numBars; - state.counters["Baz"] = numBazs; -``` - -#### Counter Reporting - -When using the console reporter, by default, user counters are are printed at -the end after the table, the same way as ``bytes_processed`` and -``items_processed``. This is best for cases in which there are few counters, -or where there are only a couple of lines per benchmark. Here's an example of -the default output: - -``` ------------------------------------------------------------------------------- -Benchmark Time CPU Iterations UserCounters... 
------------------------------------------------------------------------------- -BM_UserCounter/threads:8 2248 ns 10277 ns 68808 Bar=16 Bat=40 Baz=24 Foo=8 -BM_UserCounter/threads:1 9797 ns 9788 ns 71523 Bar=2 Bat=5 Baz=3 Foo=1024m -BM_UserCounter/threads:2 4924 ns 9842 ns 71036 Bar=4 Bat=10 Baz=6 Foo=2 -BM_UserCounter/threads:4 2589 ns 10284 ns 68012 Bar=8 Bat=20 Baz=12 Foo=4 -BM_UserCounter/threads:8 2212 ns 10287 ns 68040 Bar=16 Bat=40 Baz=24 Foo=8 -BM_UserCounter/threads:16 1782 ns 10278 ns 68144 Bar=32 Bat=80 Baz=48 Foo=16 -BM_UserCounter/threads:32 1291 ns 10296 ns 68256 Bar=64 Bat=160 Baz=96 Foo=32 -BM_UserCounter/threads:4 2615 ns 10307 ns 68040 Bar=8 Bat=20 Baz=12 Foo=4 -BM_Factorial 26 ns 26 ns 26608979 40320 -BM_Factorial/real_time 26 ns 26 ns 26587936 40320 -BM_CalculatePiRange/1 16 ns 16 ns 45704255 0 -BM_CalculatePiRange/8 73 ns 73 ns 9520927 3.28374 -BM_CalculatePiRange/64 609 ns 609 ns 1140647 3.15746 -BM_CalculatePiRange/512 4900 ns 4901 ns 142696 3.14355 -``` - -If this doesn't suit you, you can print each counter as a table column by -passing the flag `--benchmark_counters_tabular=true` to the benchmark -application. This is best for cases in which there are a lot of counters, or -a lot of lines per individual benchmark. Note that this will trigger a -reprinting of the table header any time the counter set changes between -individual benchmarks. 
Here's an example of corresponding output when -`--benchmark_counters_tabular=true` is passed: - -``` ---------------------------------------------------------------------------------------- -Benchmark Time CPU Iterations Bar Bat Baz Foo ---------------------------------------------------------------------------------------- -BM_UserCounter/threads:8 2198 ns 9953 ns 70688 16 40 24 8 -BM_UserCounter/threads:1 9504 ns 9504 ns 73787 2 5 3 1 -BM_UserCounter/threads:2 4775 ns 9550 ns 72606 4 10 6 2 -BM_UserCounter/threads:4 2508 ns 9951 ns 70332 8 20 12 4 -BM_UserCounter/threads:8 2055 ns 9933 ns 70344 16 40 24 8 -BM_UserCounter/threads:16 1610 ns 9946 ns 70720 32 80 48 16 -BM_UserCounter/threads:32 1192 ns 9948 ns 70496 64 160 96 32 -BM_UserCounter/threads:4 2506 ns 9949 ns 70332 8 20 12 4 --------------------------------------------------------------- -Benchmark Time CPU Iterations --------------------------------------------------------------- -BM_Factorial 26 ns 26 ns 26392245 40320 -BM_Factorial/real_time 26 ns 26 ns 26494107 40320 -BM_CalculatePiRange/1 15 ns 15 ns 45571597 0 -BM_CalculatePiRange/8 74 ns 74 ns 9450212 3.28374 -BM_CalculatePiRange/64 595 ns 595 ns 1173901 3.15746 -BM_CalculatePiRange/512 4752 ns 4752 ns 147380 3.14355 -BM_CalculatePiRange/4k 37970 ns 37972 ns 18453 3.14184 -BM_CalculatePiRange/32k 303733 ns 303744 ns 2305 3.14162 -BM_CalculatePiRange/256k 2434095 ns 2434186 ns 288 3.1416 -BM_CalculatePiRange/1024k 9721140 ns 9721413 ns 71 3.14159 -BM_CalculatePi/threads:8 2255 ns 9943 ns 70936 -``` -Note above the additional header printed when the benchmark changes from -``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does -not have the same counter set as ``BM_UserCounter``. 
- - - -### Multithreaded Benchmarks - -In a multithreaded test (benchmark invoked by multiple threads simultaneously), -it is guaranteed that none of the threads will start until all have reached -the start of the benchmark loop, and all will have finished before any thread -exits the benchmark loop. (This behavior is also provided by the `KeepRunning()` -API) As such, any global setup or teardown can be wrapped in a check against the thread -index: - -```c++ -static void BM_MultiThreaded(benchmark::State& state) { - if (state.thread_index == 0) { - // Setup code here. - } - for (auto _ : state) { - // Run the test as normal. - } - if (state.thread_index == 0) { - // Teardown code here. - } -} -BENCHMARK(BM_MultiThreaded)->Threads(2); -``` - -If the benchmarked code itself uses threads and you want to compare it to -single-threaded code, you may want to use real-time ("wallclock") measurements -for latency comparisons: - -```c++ -BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime(); -``` - -Without `UseRealTime`, CPU time is used by default. - - - -### CPU Timers - -By default, the CPU timer only measures the time spent by the main thread. -If the benchmark itself uses threads internally, this measurement may not -be what you are looking for. Instead, there is a way to measure the total -CPU usage of the process, by all the threads. - -```c++ -void callee(int i); - -static void MyMain(int size) { -#pragma omp parallel for - for(int i = 0; i < size; i++) - callee(i); -} - -static void BM_OpenMP(benchmark::State& state) { - for (auto _ : state) - MyMain(state.range(0); -} - -// Measure the time spent by the main thread, use it to decide for how long to -// run the benchmark loop. Depending on the internal implementation detail may -// measure to anywhere from near-zero (the overhead spent before/after work -// handoff to worker thread[s]) to the whole single-thread time. 
-BENCHMARK(BM_OpenMP)->Range(8, 8<<10); - -// Measure the user-visible time, the wall clock (literally, the time that -// has passed on the clock on the wall), use it to decide for how long to -// run the benchmark loop. This will always be meaningful, an will match the -// time spent by the main thread in single-threaded case, in general decreasing -// with the number of internal threads doing the work. -BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime(); - -// Measure the total CPU consumption, use it to decide for how long to -// run the benchmark loop. This will always measure to no less than the -// time spent by the main thread in single-threaded case. -BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime(); - -// A mixture of the last two. Measure the total CPU consumption, but use the -// wall clock to decide for how long to run the benchmark loop. -BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime(); -``` - -#### Controlling Timers - -Normally, the entire duration of the work loop (`for (auto _ : state) {}`) -is measured. But sometimes, it is necessary to do some work inside of -that loop, every iteration, but without counting that time to the benchmark time. -That is possible, althought it is not recommended, since it has high overhead. - -```c++ -static void BM_SetInsert_With_Timer_Control(benchmark::State& state) { - std::set data; - for (auto _ : state) { - state.PauseTiming(); // Stop timers. They will not count until they are resumed. - data = ConstructRandomSet(state.range(0)); // Do something that should not be measured - state.ResumeTiming(); // And resume timers. They are now counting again. - // The rest will be measured. 
- for (int j = 0; j < state.range(1); ++j) - data.insert(RandomNumber()); - } -} -BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}}); -``` - - - -### Manual Timing - -For benchmarking something for which neither CPU time nor real-time are -correct or accurate enough, completely manual timing is supported using -the `UseManualTime` function. - -When `UseManualTime` is used, the benchmarked code must call -`SetIterationTime` once per iteration of the benchmark loop to -report the manually measured time. - -An example use case for this is benchmarking GPU execution (e.g. OpenCL -or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot -be accurately measured using CPU time or real-time. Instead, they can be -measured accurately using a dedicated API, and these measurement results -can be reported back with `SetIterationTime`. - -```c++ -static void BM_ManualTiming(benchmark::State& state) { - int microseconds = state.range(0); - std::chrono::duration sleep_duration { - static_cast(microseconds) - }; - - for (auto _ : state) { - auto start = std::chrono::high_resolution_clock::now(); - // Simulate some useful workload with a sleep - std::this_thread::sleep_for(sleep_duration); - auto end = std::chrono::high_resolution_clock::now(); - - auto elapsed_seconds = - std::chrono::duration_cast>( - end - start); - - state.SetIterationTime(elapsed_seconds.count()); - } -} -BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime(); -``` - - - -### Setting the Time Unit - -If a benchmark runs a few milliseconds it may be hard to visually compare the -measured times, since the output data is given in nanoseconds per default. 
In -order to manually set the time unit, you can specify it manually: - -```c++ -BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); -``` - - - -### Preventing Optimization - -To prevent a value or expression from being optimized away by the compiler -the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()` -functions can be used. - -```c++ -static void BM_test(benchmark::State& state) { - for (auto _ : state) { - int x = 0; - for (int i=0; i < 64; ++i) { - benchmark::DoNotOptimize(x += i); - } - } -} -``` - -`DoNotOptimize()` forces the *result* of `` to be stored in either -memory or a register. For GNU based compilers it acts as read/write barrier -for global memory. More specifically it forces the compiler to flush pending -writes to memory and reload any other values as necessary. - -Note that `DoNotOptimize()` does not prevent optimizations on `` -in any way. `` may even be removed entirely when the result is already -known. For example: - -```c++ - /* Example 1: `` is removed entirely. */ - int foo(int x) { return x + 42; } - while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42); - - /* Example 2: Result of '' is only reused */ - int bar(int) __attribute__((const)); - while (...) DoNotOptimize(bar(0)); // Optimized to: - // int __result__ = bar(0); - // while (...) DoNotOptimize(__result__); -``` - -The second tool for preventing optimizations is `ClobberMemory()`. In essence -`ClobberMemory()` forces the compiler to perform all pending writes to global -memory. Memory managed by block scope objects must be "escaped" using -`DoNotOptimize(...)` before it can be clobbered. In the below example -`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized -away. - -```c++ -static void BM_vector_push_back(benchmark::State& state) { - for (auto _ : state) { - std::vector v; - v.reserve(1); - benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered. 
- v.push_back(42); - benchmark::ClobberMemory(); // Force 42 to be written to memory. - } -} -``` - -Note that `ClobberMemory()` is only available for GNU or MSVC based compilers. - - - -### Statistics: Reporting the Mean, Median and Standard Deviation of Repeated Benchmarks - -By default each benchmark is run once and that single result is reported. -However benchmarks are often noisy and a single result may not be representative -of the overall behavior. For this reason it's possible to repeatedly rerun the -benchmark. - -The number of runs of each benchmark is specified globally by the -`--benchmark_repetitions` flag or on a per benchmark basis by calling -`Repetitions` on the registered benchmark object. When a benchmark is run more -than once the mean, median and standard deviation of the runs will be reported. - -Additionally the `--benchmark_report_aggregates_only={true|false}`, -`--benchmark_display_aggregates_only={true|false}` flags or -`ReportAggregatesOnly(bool)`, `DisplayAggregatesOnly(bool)` functions can be -used to change how repeated tests are reported. By default the result of each -repeated run is reported. When `report aggregates only` option is `true`, -only the aggregates (i.e. mean, median and standard deviation, maybe complexity -measurements if they were requested) of the runs is reported, to both the -reporters - standard output (console), and the file. -However when only the `display aggregates only` option is `true`, -only the aggregates are displayed in the standard output, while the file -output still contains everything. -Calling `ReportAggregatesOnly(bool)` / `DisplayAggregatesOnly(bool)` on a -registered benchmark object overrides the value of the appropriate flag for that -benchmark. - - - -### Custom Statistics - -While having mean, median and standard deviation is nice, this may not be -enough for everyone. For example you may want to know what the largest -observation is, e.g. because you have some real-time constraints. 
This is easy. -The following code will specify a custom statistic to be calculated, defined -by a lambda function. - -```c++ -void BM_spin_empty(benchmark::State& state) { - for (auto _ : state) { - for (int x = 0; x < state.range(0); ++x) { - benchmark::DoNotOptimize(x); - } - } -} - -BENCHMARK(BM_spin_empty) - ->ComputeStatistics("max", [](const std::vector& v) -> double { - return *(std::max_element(std::begin(v), std::end(v))); - }) - ->Arg(512); -``` - - - -### Using RegisterBenchmark(name, fn, args...) - -The `RegisterBenchmark(name, func, args...)` function provides an alternative -way to create and register benchmarks. -`RegisterBenchmark(name, func, args...)` creates, registers, and returns a -pointer to a new benchmark with the specified `name` that invokes -`func(st, args...)` where `st` is a `benchmark::State` object. - -Unlike the `BENCHMARK` registration macros, which can only be used at the global -scope, the `RegisterBenchmark` can be called anywhere. This allows for -benchmark tests to be registered programmatically. - -Additionally `RegisterBenchmark` allows any callable object to be registered -as a benchmark. Including capturing lambdas and function objects. - -For Example: -```c++ -auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ }; - -int main(int argc, char** argv) { - for (auto& test_input : { /* ... */ }) - benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input); - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); -} -``` - - - -### Exiting with an Error - -When errors caused by external influences, such as file I/O and network -communication, occur within a benchmark the -`State::SkipWithError(const char* msg)` function can be used to skip that run -of benchmark and report the error. Note that only future iterations of the -`KeepRunning()` are skipped. For the ranged-for version of the benchmark loop -Users must explicitly exit the loop, otherwise all iterations will be performed. 
-Users may explicitly return to exit the benchmark immediately. - -The `SkipWithError(...)` function may be used at any point within the benchmark, -including before and after the benchmark loop. - -For example: - -```c++ -static void BM_test(benchmark::State& state) { - auto resource = GetResource(); - if (!resource.good()) { - state.SkipWithError("Resource is not good!"); - // KeepRunning() loop will not be entered. - } - for (state.KeepRunning()) { - auto data = resource.read_data(); - if (!resource.good()) { - state.SkipWithError("Failed to read data!"); - break; // Needed to skip the rest of the iteration. - } - do_stuff(data); - } -} - -static void BM_test_ranged_fo(benchmark::State & state) { - state.SkipWithError("test will not be entered"); - for (auto _ : state) { - state.SkipWithError("Failed!"); - break; // REQUIRED to prevent all further iterations. - } -} -``` - - -### A Faster KeepRunning Loop - -In C++11 mode, a ranged-based for loop should be used in preference to -the `KeepRunning` loop for running the benchmarks. For example: - -```c++ -static void BM_Fast(benchmark::State &state) { - for (auto _ : state) { - FastOperation(); - } -} -BENCHMARK(BM_Fast); -``` - -The reason the ranged-for loop is faster than using `KeepRunning`, is -because `KeepRunning` requires a memory load and store of the iteration count -ever iteration, whereas the ranged-for variant is able to keep the iteration count -in a register. 
- -For example, an empty inner loop of using the ranged-based for method looks like: - -```asm -# Loop Init - mov rbx, qword ptr [r14 + 104] - call benchmark::State::StartKeepRunning() - test rbx, rbx - je .LoopEnd -.LoopHeader: # =>This Inner Loop Header: Depth=1 - add rbx, -1 - jne .LoopHeader -.LoopEnd: -``` - -Compared to an empty `KeepRunning` loop, which looks like: - -```asm -.LoopHeader: # in Loop: Header=BB0_3 Depth=1 - cmp byte ptr [rbx], 1 - jne .LoopInit -.LoopBody: # =>This Inner Loop Header: Depth=1 - mov rax, qword ptr [rbx + 8] - lea rcx, [rax + 1] - mov qword ptr [rbx + 8], rcx - cmp rax, qword ptr [rbx + 104] - jb .LoopHeader - jmp .LoopEnd -.LoopInit: - mov rdi, rbx - call benchmark::State::StartKeepRunning() - jmp .LoopBody -.LoopEnd: -``` - -Unless C++03 compatibility is required, the ranged-for variant of writing -the benchmark loop should be preferred. - - - -### Disabling CPU Frequency Scaling -If you see this error: -``` -***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. -``` -you might want to disable the CPU frequency scaling while running the benchmark: -```bash -sudo cpupower frequency-set --governor performance -./mybench -sudo cpupower frequency-set --governor powersave +flag for option information or see the [User Guide](docs/user_guide.md). + +### Usage with CMake + +If using CMake, it is recommended to link against the project-provided +`benchmark::benchmark` and `benchmark::benchmark_main` targets using +`target_link_libraries`. +It is possible to use ```find_package``` to import an installed version of the +library. +```cmake +find_package(benchmark REQUIRED) +``` +Alternatively, ```add_subdirectory``` will incorporate the library directly in +to one's CMake project. +```cmake +add_subdirectory(benchmark) +``` +Either way, link to the library as follows. 
+```cmake +target_link_libraries(MyTarget benchmark::benchmark) ``` diff --git a/ThirdParty/googlebenchmark/README.omnisci.md b/ThirdParty/googlebenchmark/README.omnisci.md deleted file mode 100644 index 770e0c9a1d..0000000000 --- a/ThirdParty/googlebenchmark/README.omnisci.md +++ /dev/null @@ -1,3 +0,0 @@ -The provided CMake files have been modified to silence various `message(STATUS)` calls. Grep for `# message` to find all of them. - -Also disabled option `BENCHMARK_ENABLE_INSTALL` so that the libs are not included in our packages. diff --git a/ThirdParty/googlebenchmark/WORKSPACE b/ThirdParty/googlebenchmark/WORKSPACE index 9a75f968d9..833590f289 100644 --- a/ThirdParty/googlebenchmark/WORKSPACE +++ b/ThirdParty/googlebenchmark/WORKSPACE @@ -1,9 +1,22 @@ workspace(name = "com_github_google_benchmark") -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("//:bazel/benchmark_deps.bzl", "benchmark_deps") -http_archive( - name = "com_google_googletest", - urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"], - strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e", +benchmark_deps() + +load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies") + +rules_foreign_cc_dependencies() + +load("@rules_python//python:pip.bzl", pip3_install="pip_install") + +pip3_install( + name = "tools_pip_deps", + requirements = "//tools:requirements.txt", +) + +new_local_repository( + name = "python_headers", + build_file = "@//bindings/python:python_headers.BUILD", + path = "", # May be overwritten by setup.py. 
) diff --git a/ThirdParty/googlebenchmark/_config.yml b/ThirdParty/googlebenchmark/_config.yml index 18854876c6..1fa5ff852b 100644 --- a/ThirdParty/googlebenchmark/_config.yml +++ b/ThirdParty/googlebenchmark/_config.yml @@ -1 +1,2 @@ -theme: jekyll-theme-midnight \ No newline at end of file +theme: jekyll-theme-midnight +markdown: GFM diff --git a/ThirdParty/googlebenchmark/appveyor.yml b/ThirdParty/googlebenchmark/appveyor.yml index cf240190be..81da955f02 100644 --- a/ThirdParty/googlebenchmark/appveyor.yml +++ b/ThirdParty/googlebenchmark/appveyor.yml @@ -41,7 +41,7 @@ build_script: - cmake --build . --config %configuration% test_script: - - ctest -c %configuration% --timeout 300 --output-on-failure + - ctest --build-config %configuration% --timeout 300 --output-on-failure artifacts: - path: '_build/CMakeFiles/*.log' diff --git a/ThirdParty/googlebenchmark/cmake/AddCXXCompilerFlag.cmake b/ThirdParty/googlebenchmark/cmake/AddCXXCompilerFlag.cmake index d0d2099814..858589e977 100644 --- a/ThirdParty/googlebenchmark/cmake/AddCXXCompilerFlag.cmake +++ b/ThirdParty/googlebenchmark/cmake/AddCXXCompilerFlag.cmake @@ -34,9 +34,11 @@ function(add_cxx_compiler_flag FLAG) check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG}) set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") if(${MANGLED_FLAG}) - set(VARIANT ${ARGV1}) - if(ARGV1) + if(ARGC GREATER 1) + set(VARIANT ${ARGV1}) string(TOUPPER "_${VARIANT}" VARIANT) + else() + set(VARIANT "") endif() set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE) endif() @@ -49,9 +51,11 @@ function(add_required_cxx_compiler_flag FLAG) check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG}) set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") if(${MANGLED_FLAG}) - set(VARIANT ${ARGV1}) - if(ARGV1) + if(ARGC GREATER 1) + set(VARIANT ${ARGV1}) string(TOUPPER "_${VARIANT}" VARIANT) + else() + set(VARIANT "") endif() set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} 
${FLAG}" PARENT_SCOPE) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE) diff --git a/ThirdParty/googlebenchmark/cmake/CXXFeatureCheck.cmake b/ThirdParty/googlebenchmark/cmake/CXXFeatureCheck.cmake index 6c72bb7333..e51482659b 100644 --- a/ThirdParty/googlebenchmark/cmake/CXXFeatureCheck.cmake +++ b/ThirdParty/googlebenchmark/cmake/CXXFeatureCheck.cmake @@ -17,6 +17,8 @@ if(__cxx_feature_check) endif() set(__cxx_feature_check INCLUDED) +option(CXXFEATURECHECK_DEBUG OFF) + function(cxx_feature_check FILE) string(TOLOWER ${FILE} FILE) string(TOUPPER ${FILE} VAR) @@ -27,38 +29,54 @@ function(cxx_feature_check FILE) return() endif() + set(FEATURE_CHECK_CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}) + if (ARGC GREATER 1) + message(STATUS "Enabling additional flags: ${ARGV1}") + list(APPEND FEATURE_CHECK_CMAKE_FLAGS ${ARGV1}) + endif() + if (NOT DEFINED COMPILE_${FEATURE}) - message(STATUS "Performing Test ${FEATURE}") if(CMAKE_CROSSCOMPILING) + message(STATUS "Cross-compiling to test ${FEATURE}") try_compile(COMPILE_${FEATURE} ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp - CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS} - LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}) + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED ON + CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS} + LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES} + OUTPUT_VARIABLE COMPILE_OUTPUT_VAR) if(COMPILE_${FEATURE}) message(WARNING "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0") - set(RUN_${FEATURE} 0) + set(RUN_${FEATURE} 0 CACHE INTERNAL "") else() - set(RUN_${FEATURE} 1) + set(RUN_${FEATURE} 1 CACHE INTERNAL "") endif() else() - message(STATUS "Performing Test ${FEATURE}") + message(STATUS "Compiling and running to test ${FEATURE}") try_run(RUN_${FEATURE} COMPILE_${FEATURE} ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp - CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS} - LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}) + CXX_STANDARD 11 + 
CXX_STANDARD_REQUIRED ON + CMAKE_FLAGS ${FEATURE_CHECK_CMAKE_FLAGS} + LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES} + COMPILE_OUTPUT_VARIABLE COMPILE_OUTPUT_VAR) endif() endif() if(RUN_${FEATURE} EQUAL 0) - # message(STATUS "Performing Test ${FEATURE} -- success") + message(STATUS "Performing Test ${FEATURE} -- success") set(HAVE_${VAR} 1 PARENT_SCOPE) add_definitions(-DHAVE_${VAR}) else() if(NOT COMPILE_${FEATURE}) - # message(STATUS "Performing Test ${FEATURE} -- failed to compile") + if(CXXFEATURECHECK_DEBUG) + message(STATUS "Performing Test ${FEATURE} -- failed to compile: ${COMPILE_OUTPUT_VAR}") + else() + message(STATUS "Performing Test ${FEATURE} -- failed to compile") + endif() else() - # message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run") + message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run") endif() endif() endfunction() diff --git a/ThirdParty/googlebenchmark/cmake/Config.cmake.in b/ThirdParty/googlebenchmark/cmake/Config.cmake.in index 6e9256eea8..2e15f0cf82 100644 --- a/ThirdParty/googlebenchmark/cmake/Config.cmake.in +++ b/ThirdParty/googlebenchmark/cmake/Config.cmake.in @@ -1 +1,7 @@ +@PACKAGE_INIT@ + +include (CMakeFindDependencyMacro) + +find_dependency (Threads) + include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake") diff --git a/ThirdParty/googlebenchmark/cmake/GetGitVersion.cmake b/ThirdParty/googlebenchmark/cmake/GetGitVersion.cmake index defb2cb7a7..04a1f9b70d 100644 --- a/ThirdParty/googlebenchmark/cmake/GetGitVersion.cmake +++ b/ThirdParty/googlebenchmark/cmake/GetGitVersion.cmake @@ -20,16 +20,20 @@ set(__get_git_version INCLUDED) function(get_git_version var) if(GIT_EXECUTABLE) - execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 + execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE status - OUTPUT_VARIABLE GIT_VERSION + OUTPUT_VARIABLE 
GIT_DESCRIBE_VERSION ERROR_QUIET) - if(${status}) - set(GIT_VERSION "v0.0.0") + if(status) + set(GIT_DESCRIBE_VERSION "v0.0.0") + endif() + + string(STRIP ${GIT_DESCRIBE_VERSION} GIT_DESCRIBE_VERSION) + if(GIT_DESCRIBE_VERSION MATCHES v[^-]*-) + string(REGEX REPLACE "v([^-]*)-([0-9]+)-.*" "\\1.\\2" GIT_VERSION ${GIT_DESCRIBE_VERSION}) else() - string(STRIP ${GIT_VERSION} GIT_VERSION) - string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION}) + string(REGEX REPLACE "v(.*)" "\\1" GIT_VERSION ${GIT_DESCRIBE_VERSION}) endif() # Work out if the repository is dirty @@ -43,12 +47,12 @@ function(get_git_version var) ERROR_QUIET) string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY) if (${GIT_DIRTY}) - set(GIT_VERSION "${GIT_VERSION}-dirty") + set(GIT_DESCRIBE_VERSION "${GIT_DESCRIBE_VERSION}-dirty") endif() + message(STATUS "git version: ${GIT_DESCRIBE_VERSION} normalized to ${GIT_VERSION}") else() - set(GIT_VERSION "v0.0.0") + set(GIT_VERSION "0.0.0") endif() - # message(STATUS "git Version: ${GIT_VERSION}") set(${var} ${GIT_VERSION} PARENT_SCOPE) endfunction() diff --git a/ThirdParty/googlebenchmark/cmake/GoogleTest.cmake b/ThirdParty/googlebenchmark/cmake/GoogleTest.cmake index fb7c6be25e..e66e9d1a20 100644 --- a/ThirdParty/googlebenchmark/cmake/GoogleTest.cmake +++ b/ThirdParty/googlebenchmark/cmake/GoogleTest.cmake @@ -2,7 +2,7 @@ set(GOOGLETEST_PREFIX "${benchmark_BINARY_DIR}/third_party/googletest") configure_file(${benchmark_SOURCE_DIR}/cmake/GoogleTest.cmake.in ${GOOGLETEST_PREFIX}/CMakeLists.txt @ONLY) -set(GOOGLETEST_PATH "${CMAKE_CURRENT_SOURCE_DIR}/googletest") # Mind the quotes +set(GOOGLETEST_PATH "${CMAKE_CURRENT_SOURCE_DIR}/googletest" CACHE PATH "") # Mind the quotes execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" -DALLOW_DOWNLOADING_GOOGLETEST=${BENCHMARK_DOWNLOAD_DEPENDENCIES} -DGOOGLETEST_PATH:PATH=${GOOGLETEST_PATH} . 
RESULT_VARIABLE result @@ -35,7 +35,24 @@ add_subdirectory(${GOOGLETEST_SOURCE_DIR} ${GOOGLETEST_BINARY_DIR} EXCLUDE_FROM_ALL) -set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $) -set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $) -set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $) -set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $) +# googletest doesn't seem to want to stay build warning clean so let's not hurt ourselves. +if (MSVC) + target_compile_options(gtest PRIVATE "/wd4244" "/wd4722") + target_compile_options(gtest_main PRIVATE "/wd4244" "/wd4722") + target_compile_options(gmock PRIVATE "/wd4244" "/wd4722") + target_compile_options(gmock_main PRIVATE "/wd4244" "/wd4722") +else() + target_compile_options(gtest PRIVATE "-w") + target_compile_options(gtest_main PRIVATE "-w") + target_compile_options(gmock PRIVATE "-w") + target_compile_options(gmock_main PRIVATE "-w") +endif() + +if(NOT DEFINED GTEST_COMPILE_COMMANDS) + set(GTEST_COMPILE_COMMANDS ON) +endif() + +set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $ EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS}) +set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $ EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS}) +set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $ EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS}) +set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $ EXPORT_COMPILE_COMMANDS ${GTEST_COMPILE_COMMANDS}) diff --git a/ThirdParty/googlebenchmark/cmake/GoogleTest.cmake.in b/ThirdParty/googlebenchmark/cmake/GoogleTest.cmake.in index 28818ee293..ce653ac375 100644 --- a/ThirdParty/googlebenchmark/cmake/GoogleTest.cmake.in +++ b/ThirdParty/googlebenchmark/cmake/GoogleTest.cmake.in @@ -31,13 +31,14 @@ if(EXISTS "${GOOGLETEST_PATH}" AND IS_DIRECTORY "${GOOGLETEST_PATH}" ) 
else() if(NOT ALLOW_DOWNLOADING_GOOGLETEST) - message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable ALLOW_DOWNLOADING_GOOGLETEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.") + message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable BENCHMARK_DOWNLOAD_DEPENDENCIES, or disable BENCHMARK_USE_BUNDLED_GTEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.") + return() else() message(WARNING "Did not find Google Test sources! Fetching from web...") ExternalProject_Add( googletest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG master + GIT_TAG "release-1.11.0" PREFIX "${CMAKE_BINARY_DIR}" STAMP_DIR "${CMAKE_BINARY_DIR}/stamp" DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/download" diff --git a/ThirdParty/googlebenchmark/cmake/benchmark.pc.in b/ThirdParty/googlebenchmark/cmake/benchmark.pc.in index 43ca8f91d7..9dae881c79 100644 --- a/ThirdParty/googlebenchmark/cmake/benchmark.pc.in +++ b/ThirdParty/googlebenchmark/cmake/benchmark.pc.in @@ -1,7 +1,7 @@ prefix=@CMAKE_INSTALL_PREFIX@ exec_prefix=${prefix} -libdir=${prefix}/lib -includedir=${prefix}/include +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ Name: @PROJECT_NAME@ Description: Google microbenchmark framework diff --git a/ThirdParty/googlebenchmark/cmake/pthread_affinity.cpp b/ThirdParty/googlebenchmark/cmake/pthread_affinity.cpp new file mode 100644 index 0000000000..7b143bc021 --- /dev/null +++ b/ThirdParty/googlebenchmark/cmake/pthread_affinity.cpp @@ -0,0 +1,16 @@ +#include +int main() { + cpu_set_t set; + CPU_ZERO(&set); + for (int i = 0; i < CPU_SETSIZE; ++i) { + CPU_SET(i, &set); + CPU_CLR(i, &set); + } + pthread_t self = pthread_self(); + int ret; + ret = pthread_getaffinity_np(self, sizeof(set), &set); + if (ret != 0) return ret; + ret = pthread_setaffinity_np(self, sizeof(set), &set); + if (ret != 0) return ret; + 
return 0; +} diff --git a/ThirdParty/googlebenchmark/conan/CMakeLists.txt b/ThirdParty/googlebenchmark/conan/CMakeLists.txt deleted file mode 100644 index 15b92ca91a..0000000000 --- a/ThirdParty/googlebenchmark/conan/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -cmake_minimum_required(VERSION 2.8.11) -project(cmake_wrapper) - -include(conanbuildinfo.cmake) -conan_basic_setup() - -include(${CMAKE_SOURCE_DIR}/CMakeListsOriginal.txt) diff --git a/ThirdParty/googlebenchmark/conan/test_package/CMakeLists.txt b/ThirdParty/googlebenchmark/conan/test_package/CMakeLists.txt deleted file mode 100644 index 089a6c729d..0000000000 --- a/ThirdParty/googlebenchmark/conan/test_package/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -cmake_minimum_required(VERSION 2.8.11) -project(test_package) - -set(CMAKE_VERBOSE_MAKEFILE TRUE) - -include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake) -conan_basic_setup() - -add_executable(${PROJECT_NAME} test_package.cpp) -target_link_libraries(${PROJECT_NAME} ${CONAN_LIBS}) diff --git a/ThirdParty/googlebenchmark/conan/test_package/conanfile.py b/ThirdParty/googlebenchmark/conan/test_package/conanfile.py deleted file mode 100644 index d63f4088c9..0000000000 --- a/ThirdParty/googlebenchmark/conan/test_package/conanfile.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from conans import ConanFile, CMake -import os - - -class TestPackageConan(ConanFile): - settings = "os", "compiler", "build_type", "arch" - generators = "cmake" - - def build(self): - cmake = CMake(self) - cmake.configure() - cmake.build() - - def test(self): - bin_path = os.path.join("bin", "test_package") - self.run(bin_path, run_environment=True) diff --git a/ThirdParty/googlebenchmark/conan/test_package/test_package.cpp b/ThirdParty/googlebenchmark/conan/test_package/test_package.cpp deleted file mode 100644 index 4fa7ec0bf9..0000000000 --- a/ThirdParty/googlebenchmark/conan/test_package/test_package.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include 
"benchmark/benchmark.h" - -void BM_StringCreation(benchmark::State& state) { - while (state.KeepRunning()) - std::string empty_string; -} - -BENCHMARK(BM_StringCreation); - -void BM_StringCopy(benchmark::State& state) { - std::string x = "hello"; - while (state.KeepRunning()) - std::string copy(x); -} - -BENCHMARK(BM_StringCopy); - -BENCHMARK_MAIN(); diff --git a/ThirdParty/googlebenchmark/conanfile.py b/ThirdParty/googlebenchmark/conanfile.py deleted file mode 100644 index e31fc5268a..0000000000 --- a/ThirdParty/googlebenchmark/conanfile.py +++ /dev/null @@ -1,79 +0,0 @@ -from conans import ConanFile, CMake, tools -from conans.errors import ConanInvalidConfiguration -import shutil -import os - - -class GoogleBenchmarkConan(ConanFile): - name = "benchmark" - description = "A microbenchmark support library." - topics = ("conan", "benchmark", "google", "microbenchmark") - url = "https://github.com/google/benchmark" - homepage = "https://github.com/google/benchmark" - author = "Google Inc." - license = "Apache-2.0" - exports_sources = ["*"] - generators = "cmake" - - settings = "arch", "build_type", "compiler", "os" - options = { - "shared": [True, False], - "fPIC": [True, False], - "enable_lto": [True, False], - "enable_exceptions": [True, False] - } - default_options = {"shared": False, "fPIC": True, "enable_lto": False, "enable_exceptions": True} - - _build_subfolder = "." 
- - def source(self): - # Wrap the original CMake file to call conan_basic_setup - shutil.move("CMakeLists.txt", "CMakeListsOriginal.txt") - shutil.move(os.path.join("conan", "CMakeLists.txt"), "CMakeLists.txt") - - def config_options(self): - if self.settings.os == "Windows": - if self.settings.compiler == "Visual Studio" and float(self.settings.compiler.version.value) <= 12: - raise ConanInvalidConfiguration("{} {} does not support Visual Studio <= 12".format(self.name, self.version)) - del self.options.fPIC - - def configure(self): - if self.settings.os == "Windows" and self.options.shared: - raise ConanInvalidConfiguration("Windows shared builds are not supported right now, see issue #639") - - def _configure_cmake(self): - cmake = CMake(self) - - cmake.definitions["BENCHMARK_ENABLE_TESTING"] = "OFF" - cmake.definitions["BENCHMARK_ENABLE_GTEST_TESTS"] = "OFF" - cmake.definitions["BENCHMARK_ENABLE_LTO"] = "ON" if self.options.enable_lto else "OFF" - cmake.definitions["BENCHMARK_ENABLE_EXCEPTIONS"] = "ON" if self.options.enable_exceptions else "OFF" - - # See https://github.com/google/benchmark/pull/638 for Windows 32 build explanation - if self.settings.os != "Windows": - cmake.definitions["BENCHMARK_BUILD_32_BITS"] = "ON" if "64" not in str(self.settings.arch) else "OFF" - cmake.definitions["BENCHMARK_USE_LIBCXX"] = "ON" if (str(self.settings.compiler.libcxx) == "libc++") else "OFF" - else: - cmake.definitions["BENCHMARK_USE_LIBCXX"] = "OFF" - - cmake.configure(build_folder=self._build_subfolder) - return cmake - - def build(self): - cmake = self._configure_cmake() - cmake.build() - - def package(self): - cmake = self._configure_cmake() - cmake.install() - - self.copy(pattern="LICENSE", dst="licenses") - - def package_info(self): - self.cpp_info.libs = tools.collect_libs(self) - if self.settings.os == "Linux": - self.cpp_info.libs.extend(["pthread", "rt"]) - elif self.settings.os == "Windows": - self.cpp_info.libs.append("shlwapi") - elif self.settings.os == 
"SunOS": - self.cpp_info.libs.append("kstat") diff --git a/ThirdParty/googlebenchmark/dependencies.md b/ThirdParty/googlebenchmark/dependencies.md deleted file mode 100644 index 6289b4e354..0000000000 --- a/ThirdParty/googlebenchmark/dependencies.md +++ /dev/null @@ -1,18 +0,0 @@ -# Build tool dependency policy - -To ensure the broadest compatibility when building the benchmark library, but -still allow forward progress, we require any build tooling to be available for: - -* Debian stable AND -* The last two Ubuntu LTS releases AND - -Currently, this means using build tool versions that are available for Ubuntu -16.04 (Xenial), Ubuntu 18.04 (Bionic), and Debian stretch. - -_Note, [travis](.travis.yml) runs under Ubuntu 14.04 (Trusty) for linux builds._ - -## cmake -The current supported version is cmake 3.5.1 as of 2018-06-06. - -_Note, this version is also available for Ubuntu 14.04, the previous Ubuntu LTS -release, as `cmake3`._ diff --git a/ThirdParty/googlebenchmark/docs/AssemblyTests.md b/ThirdParty/googlebenchmark/docs/AssemblyTests.md new file mode 100644 index 0000000000..89df7ca520 --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/AssemblyTests.md @@ -0,0 +1,149 @@ +# Assembly Tests + +The Benchmark library provides a number of functions whose primary +purpose in to affect assembly generation, including `DoNotOptimize` +and `ClobberMemory`. In addition there are other functions, +such as `KeepRunning`, for which generating good assembly is paramount. + +For these functions it's important to have tests that verify the +correctness and quality of the implementation. This requires testing +the code generated by the compiler. + +This document describes how the Benchmark library tests compiler output, +as well as how to properly write new tests. + + +## Anatomy of a Test + +Writing a test has two steps: + +* Write the code you want to generate assembly for. +* Add `// CHECK` lines to match against the verified assembly. 
+ +Example: +```c++ + +// CHECK-LABEL: test_add: +extern "C" int test_add() { + extern int ExternInt; + return ExternInt + 1; + + // CHECK: movl ExternInt(%rip), %eax + // CHECK: addl %eax + // CHECK: ret +} + +``` + +#### LLVM Filecheck + +[LLVM's Filecheck](https://llvm.org/docs/CommandGuide/FileCheck.html) +is used to test the generated assembly against the `// CHECK` lines +specified in the tests source file. Please see the documentation +linked above for information on how to write `CHECK` directives. + +#### Tips and Tricks: + +* Tests should match the minimal amount of output required to establish +correctness. `CHECK` directives don't have to match on the exact next line +after the previous match, so tests should omit checks for unimportant +bits of assembly. ([`CHECK-NEXT`](https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-next-directive) +can be used to ensure a match occurs exactly after the previous match). + +* The tests are compiled with `-O3 -g0`. So we're only testing the +optimized output. + +* The assembly output is further cleaned up using `tools/strip_asm.py`. +This removes comments, assembler directives, and unused labels before +the test is run. + +* The generated and stripped assembly file for a test is output under +`/test/.s` + +* Filecheck supports using [`CHECK` prefixes](https://llvm.org/docs/CommandGuide/FileCheck.html#cmdoption-check-prefixes) +to specify lines that should only match in certain situations. +The Benchmark tests use `CHECK-CLANG` and `CHECK-GNU` for lines that +are only expected to match Clang or GCC's output respectively. Normal +`CHECK` lines match against all compilers. (Note: `CHECK-NOT` and +`CHECK-LABEL` are NOT prefixes. They are versions of non-prefixed +`CHECK` lines) + +* Use `extern "C"` to disable name mangling for specific functions. This +makes them easier to name in the `CHECK` lines. 
+ + +## Problems Writing Portable Tests + +Writing tests which check the code generated by a compiler are +inherently non-portable. Different compilers and even different compiler +versions may generate entirely different code. The Benchmark tests +must tolerate this. + +LLVM Filecheck provides a number of mechanisms to help write +"more portable" tests; including [matching using regular expressions](https://llvm.org/docs/CommandGuide/FileCheck.html#filecheck-pattern-matching-syntax), +allowing the creation of [named variables](https://llvm.org/docs/CommandGuide/FileCheck.html#filecheck-variables) +for later matching, and [checking non-sequential matches](https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-dag-directive). + +#### Capturing Variables + +For example, say GCC stores a variable in a register but Clang stores +it in memory. To write a test that tolerates both cases we "capture" +the destination of the store, and then use the captured expression +to write the remainder of the test. + +```c++ +// CHECK-LABEL: test_div_no_op_into_shr: +extern "C" void test_div_no_op_into_shr(int value) { + int divisor = 2; + benchmark::DoNotOptimize(divisor); // hide the value from the optimizer + return value / divisor; + + // CHECK: movl $2, [[DEST:.*]] + // CHECK: idivl [[DEST]] + // CHECK: ret +} +``` + +#### Using Regular Expressions to Match Differing Output + +Often tests require testing assembly lines which may subtly differ +between compilers or compiler versions. A common example of this +is matching stack frame addresses. In this case regular expressions +can be used to match the differing bits of output. 
For example: + + +```c++ +int ExternInt; +struct Point { int x, y, z; }; + +// CHECK-LABEL: test_store_point: +extern "C" void test_store_point() { + Point p{ExternInt, ExternInt, ExternInt}; + benchmark::DoNotOptimize(p); + + // CHECK: movl ExternInt(%rip), %eax + // CHECK: movl %eax, -{{[0-9]+}}(%rsp) + // CHECK: movl %eax, -{{[0-9]+}}(%rsp) + // CHECK: movl %eax, -{{[0-9]+}}(%rsp) + // CHECK: ret +} +``` + + +## Current Requirements and Limitations + +The tests require Filecheck to be installed along the `PATH` of the +build machine. Otherwise the tests will be disabled. + +Additionally, as mentioned in the previous section, codegen tests are +inherently non-portable. Currently the tests are limited to: + +* x86_64 targets. +* Compiled with GCC or Clang + +Further work could be done, at least on a limited basis, to extend the +tests to other architectures and compilers (using `CHECK` prefixes). + +Furthermore, the tests fail for builds which specify additional flags +that modify code generation, including `--coverage` or `-fsanitize=`. 
+ diff --git a/ThirdParty/googlebenchmark/docs/_config.yml b/ThirdParty/googlebenchmark/docs/_config.yml new file mode 100644 index 0000000000..32f9f2e0dd --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/_config.yml @@ -0,0 +1,3 @@ +theme: jekyll-theme-minimal +logo: /assets/images/icon_black.png +show_downloads: true diff --git a/ThirdParty/googlebenchmark/docs/assets/images/icon.png b/ThirdParty/googlebenchmark/docs/assets/images/icon.png new file mode 100644 index 0000000000..b98260486e Binary files /dev/null and b/ThirdParty/googlebenchmark/docs/assets/images/icon.png differ diff --git a/ThirdParty/googlebenchmark/docs/assets/images/icon.xcf b/ThirdParty/googlebenchmark/docs/assets/images/icon.xcf new file mode 100644 index 0000000000..f2f0be415f Binary files /dev/null and b/ThirdParty/googlebenchmark/docs/assets/images/icon.xcf differ diff --git a/ThirdParty/googlebenchmark/docs/assets/images/icon_black.png b/ThirdParty/googlebenchmark/docs/assets/images/icon_black.png new file mode 100644 index 0000000000..656ae797cf Binary files /dev/null and b/ThirdParty/googlebenchmark/docs/assets/images/icon_black.png differ diff --git a/ThirdParty/googlebenchmark/docs/assets/images/icon_black.xcf b/ThirdParty/googlebenchmark/docs/assets/images/icon_black.xcf new file mode 100644 index 0000000000..430e7bafe5 Binary files /dev/null and b/ThirdParty/googlebenchmark/docs/assets/images/icon_black.xcf differ diff --git a/ThirdParty/googlebenchmark/docs/dependencies.md b/ThirdParty/googlebenchmark/docs/dependencies.md new file mode 100644 index 0000000000..07760e10e3 --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/dependencies.md @@ -0,0 +1,13 @@ +# Build tool dependency policy + +We follow the [Foundational C++ support policy](https://opensource.google/documentation/policies/cplusplus-support) for our build tools. In +particular the ["Build Systems" section](https://opensource.google/documentation/policies/cplusplus-support#build-systems). 
+ +## CMake + +The current supported version is CMake 3.10 as of 2023-08-10. Most modern +distributions include newer versions, for example: + +* Ubuntu 20.04 provides CMake 3.16.3 +* Debian 11.4 provides CMake 3.18.4 +* Ubuntu 22.04 provides CMake 3.22.1 diff --git a/ThirdParty/googlebenchmark/docs/index.md b/ThirdParty/googlebenchmark/docs/index.md new file mode 100644 index 0000000000..9cada9688b --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/index.md @@ -0,0 +1,12 @@ +# Benchmark + +* [Assembly Tests](AssemblyTests.md) +* [Dependencies](dependencies.md) +* [Perf Counters](perf_counters.md) +* [Platform Specific Build Instructions](platform_specific_build_instructions.md) +* [Python Bindings](python_bindings.md) +* [Random Interleaving](random_interleaving.md) +* [Reducing Variance](reducing_variance.md) +* [Releasing](releasing.md) +* [Tools](tools.md) +* [User Guide](user_guide.md) diff --git a/ThirdParty/googlebenchmark/docs/perf_counters.md b/ThirdParty/googlebenchmark/docs/perf_counters.md new file mode 100644 index 0000000000..f342092c99 --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/perf_counters.md @@ -0,0 +1,35 @@ + + +# User-Requested Performance Counters + +When running benchmarks, the user may choose to request collection of +performance counters. This may be useful in investigation scenarios - narrowing +down the cause of a regression; or verifying that the underlying cause of a +performance improvement matches expectations. + +This feature is available if: + +* The benchmark is run on an architecture featuring a Performance Monitoring + Unit (PMU), +* The benchmark is compiled with support for collecting counters. Currently, + this requires [libpfm](http://perfmon2.sourceforge.net/), which is built as a + dependency via Bazel. + +The feature does not require modifying benchmark code. Counter collection is +handled at the boundaries where timer collection is also handled. 
+ +To opt-in: +* If using a Bazel build, add `--define pfm=1` to your build flags +* If using CMake: + * Install `libpfm4-dev`, e.g. `apt-get install libpfm4-dev`. + * Enable the CMake flag `BENCHMARK_ENABLE_LIBPFM` in `CMakeLists.txt`. + +To use, pass a comma-separated list of counter names through the +`--benchmark_perf_counters` flag. The names are decoded through libpfm - meaning, +they are platform specific, but some (e.g. `CYCLES` or `INSTRUCTIONS`) are +mapped by libpfm to platform-specifics - see libpfm +[documentation](http://perfmon2.sourceforge.net/docs.html) for more details. + +The counter values are reported back through the [User Counters](../README.md#custom-counters) +mechanism, meaning, they are available in all the formats (e.g. JSON) supported +by User Counters. diff --git a/ThirdParty/googlebenchmark/docs/platform_specific_build_instructions.md b/ThirdParty/googlebenchmark/docs/platform_specific_build_instructions.md new file mode 100644 index 0000000000..2d5d6c47ee --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/platform_specific_build_instructions.md @@ -0,0 +1,48 @@ +# Platform Specific Build Instructions + +## Building with GCC + +When the library is built using GCC it is necessary to link with the pthread +library due to how GCC implements `std::thread`. Failing to link to pthread will +lead to runtime exceptions (unless you're using libc++), not linker errors. See +[issue #67](https://github.com/google/benchmark/issues/67) for more details. You +can link to pthread by adding `-pthread` to your linker command. Note, you can +also use `-lpthread`, but there are potential issues with ordering of command +line parameters if you use that. + +On QNX, the pthread library is part of libc and usually included automatically +(see +[`pthread_create()`](https://www.qnx.com/developers/docs/7.1/index.html#com.qnx.doc.neutrino.lib_ref/topic/p/pthread_create.html)). +There's no separate pthread library to link. 
+ +## Building with Visual Studio 2015 or 2017 + +The `shlwapi` library (`-lshlwapi`) is required to support a call to `CPUInfo` which reads the registry. Either add `shlwapi.lib` under `[ Configuration Properties > Linker > Input ]`, or use the following: + +``` +// Alternatively, can add libraries using linker options. +#ifdef _WIN32 +#pragma comment ( lib, "Shlwapi.lib" ) +#ifdef _DEBUG +#pragma comment ( lib, "benchmarkd.lib" ) +#else +#pragma comment ( lib, "benchmark.lib" ) +#endif +#endif +``` + +Can also use the graphical version of CMake: +* Open `CMake GUI`. +* Under `Where to build the binaries`, same path as source plus `build`. +* Under `CMAKE_INSTALL_PREFIX`, same path as source plus `install`. +* Click `Configure`, `Generate`, `Open Project`. +* If build fails, try deleting entire directory and starting again, or unticking options to build less. + +## Building with Intel 2015 Update 1 or Intel System Studio Update 4 + +See instructions for building with Visual Studio. Once built, right click on the solution and change the build to Intel. + +## Building on Solaris + +If you're running benchmarks on solaris, you'll want the kstat library linked in +too (`-lkstat`). \ No newline at end of file diff --git a/ThirdParty/googlebenchmark/docs/python_bindings.md b/ThirdParty/googlebenchmark/docs/python_bindings.md new file mode 100644 index 0000000000..6a7aab0a29 --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/python_bindings.md @@ -0,0 +1,34 @@ +# Building and installing Python bindings + +Python bindings are available as wheels on [PyPI](https://pypi.org/project/google-benchmark/) for importing and +using Google Benchmark directly in Python. +Currently, pre-built wheels exist for macOS (both ARM64 and Intel x86), Linux x86-64 and 64-bit Windows. +Supported Python versions are Python 3.7 - 3.10. 
+ +To install Google Benchmark's Python bindings, run: + +```bash +python -m pip install --upgrade pip # for manylinux2014 support +python -m pip install google-benchmark +``` + +In order to keep your system Python interpreter clean, it is advisable to run these commands in a virtual +environment. See the [official Python documentation](https://docs.python.org/3/library/venv.html) +on how to create virtual environments. + +To build a wheel directly from source, you can follow these steps: +```bash +git clone https://github.com/google/benchmark.git +cd benchmark +# create a virtual environment and activate it +python3 -m venv venv --system-site-packages +source venv/bin/activate # .\venv\Scripts\Activate.ps1 on Windows + +# upgrade Python's system-wide packages +python -m pip install --upgrade pip setuptools wheel +# builds the wheel and stores it in the directory "wheelhouse". +python -m pip wheel . -w wheelhouse +``` + +NB: Building wheels from source requires Bazel. For platform-specific instructions on how to install Bazel, +refer to the [Bazel installation docs](https://bazel.build/install). diff --git a/ThirdParty/googlebenchmark/docs/random_interleaving.md b/ThirdParty/googlebenchmark/docs/random_interleaving.md new file mode 100644 index 0000000000..c083036841 --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/random_interleaving.md @@ -0,0 +1,13 @@ + + +# Random Interleaving + +[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a +technique to lower run-to-run variance. It randomly interleaves repetitions of a +microbenchmark with repetitions from other microbenchmarks in the same benchmark +test. Data shows it is able to lower run-to-run variance by +[40%](https://github.com/google/benchmark/issues/1051) on average. 
+ +To use, you mainly need to set `--benchmark_enable_random_interleaving=true`, +and optionally specify non-zero repetition count `--benchmark_repetitions=9` +and optionally decrease the per-repetition time `--benchmark_min_time=0.1`. diff --git a/ThirdParty/googlebenchmark/docs/reducing_variance.md b/ThirdParty/googlebenchmark/docs/reducing_variance.md new file mode 100644 index 0000000000..e566ab9852 --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/reducing_variance.md @@ -0,0 +1,100 @@ +# Reducing Variance + + + +## Disabling CPU Frequency Scaling + +If you see this error: + +``` +***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. +``` + +you might want to disable the CPU frequency scaling while running the +benchmark, as well as consider other ways to stabilize the performance of +your system while benchmarking. + +See [Reducing Variance](reducing_variance.md) for more information. + +Exactly how to do this depends on the Linux distribution, +desktop environment, and installed programs. Specific details are a moving +target, so we will not attempt to exhaustively document them here. + +One simple option is to use the `cpupower` program to change the +performance governor to "performance". This tool is maintained along with +the Linux kernel and provided by your distribution. + +It must be run as root, like this: + +```bash +sudo cpupower frequency-set --governor performance +``` + +After this you can verify that all CPUs are using the performance governor +by running this command: + +```bash +cpupower frequency-info -o proc +``` + +The benchmarks you subsequently run will have less variance. + + + +## Reducing Variance in Benchmarks + +The Linux CPU frequency governor [discussed +above](user_guide#disabling-cpu-frequency-scaling) is not the only source +of noise in benchmarks. Some, but not all, of the sources of variance +include: + +1. 
On multi-core machines not all CPUs/CPU cores/CPU threads run the same + speed, so running a benchmark one time and then again may give a + different result depending on which CPU it ran on. +2. CPU scaling features that run on the CPU, like Intel's Turbo Boost and + AMD Turbo Core and Precision Boost, can temporarily change the CPU + frequency even when using the "performance" governor on Linux. +3. Context switching between CPUs, or scheduling competition on the CPU the + benchmark is running on. +4. Intel Hyperthreading or AMD SMT causing the same issue as above. +5. Cache effects caused by code running on other CPUs. +6. Non-uniform memory architectures (NUMA). + +These can cause variance in benchmark results within a single run +(`--benchmark_repetitions=N`) or across multiple runs of the benchmark +program. + +Reducing sources of variance is OS and architecture dependent, which is one +reason some companies maintain machines dedicated to performance testing. + +Some of the easier and more effective ways of reducing variance on a typical +Linux workstation are: + +1. Use the performance governor as [discussed +above](user_guide#disabling-cpu-frequency-scaling). +1. Disable processor boosting by: + ```sh + echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost + ``` + See the Linux kernel's + [boost.txt](https://www.kernel.org/doc/Documentation/cpu-freq/boost.txt) + for more information. +2. Set the benchmark program's task affinity to a fixed cpu. For example: + ```sh + taskset -c 0 ./mybenchmark + ``` +3. Disabling Hyperthreading/SMT. This can be done in the Bios or using the + `/sys` file system (see the LLVM project's [Benchmarking + tips](https://llvm.org/docs/Benchmarking.html)). +4. Close other programs that do non-trivial things based on timers, such as + your web browser, desktop environment, etc. +5. 
Reduce the working set of your benchmark to fit within the L1 cache, but + do be aware that this may lead you to optimize for an unrealistic + situation. + +Further resources on this topic: + +1. The LLVM project's [Benchmarking + tips](https://llvm.org/docs/Benchmarking.html). +1. The Arch Wiki [Cpu frequency +scaling](https://wiki.archlinux.org/title/CPU_frequency_scaling) page. diff --git a/ThirdParty/googlebenchmark/docs/releasing.md b/ThirdParty/googlebenchmark/docs/releasing.md new file mode 100644 index 0000000000..cdf415997a --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/releasing.md @@ -0,0 +1,41 @@ +# How to release + +* Make sure you're on main and synced to HEAD +* Ensure the project builds and tests run + * `parallel -j0 exec ::: test/*_test` can help ensure everything at least + passes +* Prepare release notes + * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of + commits between the last annotated tag and HEAD + * Pick the most interesting. +* Create one last commit that updates the version saved in `CMakeLists.txt`, `MODULE.bazel` + and the `__version__` variable in `bindings/python/google_benchmark/__init__.py` to the + release version you're creating. (This version will be used if benchmark is installed + from the archive you'll be creating in the next step.) + +``` +project (benchmark VERSION 1.8.0 LANGUAGES CXX) +``` + +``` +module(name = "com_github_google_benchmark", version="1.8.0") +``` + +```python +# bindings/python/google_benchmark/__init__.py + +# ... + +__version__ = "1.8.0" # <-- change this to the release version you are creating + +# ... +``` + +* Create a release through github's interface + * Note this will create a lightweight tag. 
+ + * Update this to an annotated tag: + * `git pull --tags` + * `git tag -a -f ` + * `git push --force --tags origin` +* Confirm that the "Build and upload Python wheels" action runs to completion + * run it manually if it hasn't run diff --git a/ThirdParty/googlebenchmark/docs/tools.md b/ThirdParty/googlebenchmark/docs/tools.md new file mode 100644 index 0000000000..411f41d405 --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/tools.md @@ -0,0 +1,343 @@ +# Benchmark Tools + +## compare.py + +The `compare.py` tool can be used to compare the result of benchmarks. + +### Dependencies +The utility relies on the [scipy](https://www.scipy.org) package which can be installed using pip: +```bash +pip3 install -r requirements.txt +``` + +### Displaying aggregates only + +The switch `-a` / `--display_aggregates_only` can be used to control the +display of the normal iterations vs the aggregates. When passed, it will +be passed through to the benchmark binaries to be run, and will be accounted for +in the tool itself; only the aggregates will be displayed, but not normal runs. +It only affects the display, the separate runs will still be used to calculate +the U test. + +### Modes of operation + +There are three modes of operation: + +1. Just compare two benchmarks +The program is invoked like: + +``` bash +$ compare.py benchmarks [benchmark options]... +``` +Where `` and `` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. + +`[benchmark options]` will be passed to the benchmarks invocations. They can be anything that binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes. 
+ +Example output: +``` +$ ./compare.py benchmarks ./a.out ./a.out +RUNNING: ./a.out --benchmark_out=/tmp/tmprBT5nW +Run on (8 X 4000 MHz CPU s) +2017-11-07 21:16:44 +------------------------------------------------------ +Benchmark Time CPU Iterations +------------------------------------------------------ +BM_memcpy/8 36 ns 36 ns 19101577 211.669MB/s +BM_memcpy/64 76 ns 76 ns 9412571 800.199MB/s +BM_memcpy/512 84 ns 84 ns 8249070 5.64771GB/s +BM_memcpy/1024 116 ns 116 ns 6181763 8.19505GB/s +BM_memcpy/8192 643 ns 643 ns 1062855 11.8636GB/s +BM_copy/8 222 ns 222 ns 3137987 34.3772MB/s +BM_copy/64 1608 ns 1608 ns 432758 37.9501MB/s +BM_copy/512 12589 ns 12589 ns 54806 38.7867MB/s +BM_copy/1024 25169 ns 25169 ns 27713 38.8003MB/s +BM_copy/8192 201165 ns 201112 ns 3486 38.8466MB/s +RUNNING: ./a.out --benchmark_out=/tmp/tmpt1wwG_ +Run on (8 X 4000 MHz CPU s) +2017-11-07 21:16:53 +------------------------------------------------------ +Benchmark Time CPU Iterations +------------------------------------------------------ +BM_memcpy/8 36 ns 36 ns 19397903 211.255MB/s +BM_memcpy/64 73 ns 73 ns 9691174 839.635MB/s +BM_memcpy/512 85 ns 85 ns 8312329 5.60101GB/s +BM_memcpy/1024 118 ns 118 ns 6438774 8.11608GB/s +BM_memcpy/8192 656 ns 656 ns 1068644 11.6277GB/s +BM_copy/8 223 ns 223 ns 3146977 34.2338MB/s +BM_copy/64 1611 ns 1611 ns 435340 37.8751MB/s +BM_copy/512 12622 ns 12622 ns 54818 38.6844MB/s +BM_copy/1024 25257 ns 25239 ns 27779 38.6927MB/s +BM_copy/8192 205013 ns 205010 ns 3479 38.108MB/s +Comparing ./a.out to ./a.out +Benchmark Time CPU Time Old Time New CPU Old CPU New +------------------------------------------------------------------------------------------------------ +BM_memcpy/8 +0.0020 +0.0020 36 36 36 36 +BM_memcpy/64 -0.0468 -0.0470 76 73 76 73 +BM_memcpy/512 +0.0081 +0.0083 84 85 84 85 +BM_memcpy/1024 +0.0098 +0.0097 116 118 116 118 +BM_memcpy/8192 +0.0200 +0.0203 643 656 643 656 +BM_copy/8 +0.0046 +0.0042 222 223 222 223 +BM_copy/64 +0.0020 +0.0020 1608 
 1611 1608 1611 +BM_copy/512 +0.0027 +0.0026 12589 12622 12589 12622 +BM_copy/1024 +0.0035 +0.0028 25169 25257 25169 25239 +BM_copy/8192 +0.0191 +0.0194 201165 205013 201112 205010 +``` + +What it does is, for every benchmark from the first run, it looks for the benchmark with exactly the same name in the second run, and then compares the results. If the names differ, the benchmark is omitted from the diff. +As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`. + +2. Compare two different filters of one benchmark +The program is invoked like: + +``` bash +$ compare.py filters [benchmark options]... +``` +Where `` either specifies a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. + +Where `` and `` are the same regex filters that you would pass to the `[--benchmark_filter=]` parameter of the benchmark binary. + +`[benchmark options]` will be passed to the benchmarks invocations. They can be anything that binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes. 
+ +Example output: +``` +$ ./compare.py filters ./a.out BM_memcpy BM_copy +RUNNING: ./a.out --benchmark_filter=BM_memcpy --benchmark_out=/tmp/tmpBWKk0k +Run on (8 X 4000 MHz CPU s) +2017-11-07 21:37:28 +------------------------------------------------------ +Benchmark Time CPU Iterations +------------------------------------------------------ +BM_memcpy/8 36 ns 36 ns 17891491 211.215MB/s +BM_memcpy/64 74 ns 74 ns 9400999 825.646MB/s +BM_memcpy/512 87 ns 87 ns 8027453 5.46126GB/s +BM_memcpy/1024 111 ns 111 ns 6116853 8.5648GB/s +BM_memcpy/8192 657 ns 656 ns 1064679 11.6247GB/s +RUNNING: ./a.out --benchmark_filter=BM_copy --benchmark_out=/tmp/tmpAvWcOM +Run on (8 X 4000 MHz CPU s) +2017-11-07 21:37:33 +---------------------------------------------------- +Benchmark Time CPU Iterations +---------------------------------------------------- +BM_copy/8 227 ns 227 ns 3038700 33.6264MB/s +BM_copy/64 1640 ns 1640 ns 426893 37.2154MB/s +BM_copy/512 12804 ns 12801 ns 55417 38.1444MB/s +BM_copy/1024 25409 ns 25407 ns 27516 38.4365MB/s +BM_copy/8192 202986 ns 202990 ns 3454 38.4871MB/s +Comparing BM_memcpy to BM_copy (from ./a.out) +Benchmark Time CPU Time Old Time New CPU Old CPU New +-------------------------------------------------------------------------------------------------------------------- +[BM_memcpy vs. BM_copy]/8 +5.2829 +5.2812 36 227 36 227 +[BM_memcpy vs. BM_copy]/64 +21.1719 +21.1856 74 1640 74 1640 +[BM_memcpy vs. BM_copy]/512 +145.6487 +145.6097 87 12804 87 12801 +[BM_memcpy vs. BM_copy]/1024 +227.1860 +227.1776 111 25409 111 25407 +[BM_memcpy vs. BM_copy]/8192 +308.1664 +308.2898 657 202986 656 202990 +``` + +As you can see, it applies filter to the benchmarks, both when running the benchmark, and before doing the diff. And to make the diff work, the matches are replaced with some common string. Thus, you can compare two different benchmark families within one benchmark binary. 
+As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`. + +3. Compare filter one from benchmark one to filter two from benchmark two: +The program is invoked like: + +``` bash +$ compare.py filters [benchmark options]... +``` + +Where `` and `` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file. + +Where `` and `` are the same regex filters that you would pass to the `[--benchmark_filter=]` parameter of the benchmark binary. + +`[benchmark options]` will be passed to the benchmarks invocations. They can be anything that binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes. + +Example output: +``` +$ ./compare.py benchmarksfiltered ./a.out BM_memcpy ./a.out BM_copy +RUNNING: ./a.out --benchmark_filter=BM_memcpy --benchmark_out=/tmp/tmp_FvbYg +Run on (8 X 4000 MHz CPU s) +2017-11-07 21:38:27 +------------------------------------------------------ +Benchmark Time CPU Iterations +------------------------------------------------------ +BM_memcpy/8 37 ns 37 ns 18953482 204.118MB/s +BM_memcpy/64 74 ns 74 ns 9206578 828.245MB/s +BM_memcpy/512 91 ns 91 ns 8086195 5.25476GB/s +BM_memcpy/1024 120 ns 120 ns 5804513 7.95662GB/s +BM_memcpy/8192 664 ns 664 ns 1028363 11.4948GB/s +RUNNING: ./a.out --benchmark_filter=BM_copy --benchmark_out=/tmp/tmpDfL5iE +Run on (8 X 4000 MHz CPU s) +2017-11-07 21:38:32 +---------------------------------------------------- +Benchmark Time CPU Iterations +---------------------------------------------------- +BM_copy/8 230 ns 230 ns 2985909 33.1161MB/s +BM_copy/64 1654 ns 1653 ns 419408 36.9137MB/s +BM_copy/512 13122 ns 13120 ns 53403 37.2156MB/s +BM_copy/1024 26679 ns 26666 ns 26575 36.6218MB/s +BM_copy/8192 215068 ns 215053 ns 3221 
36.3283MB/s +Comparing BM_memcpy (from ./a.out) to BM_copy (from ./a.out) +Benchmark Time CPU Time Old Time New CPU Old CPU New +-------------------------------------------------------------------------------------------------------------------- +[BM_memcpy vs. BM_copy]/8 +5.1649 +5.1637 37 230 37 230 +[BM_memcpy vs. BM_copy]/64 +21.4352 +21.4374 74 1654 74 1653 +[BM_memcpy vs. BM_copy]/512 +143.6022 +143.5865 91 13122 91 13120 +[BM_memcpy vs. BM_copy]/1024 +221.5903 +221.4790 120 26679 120 26666 +[BM_memcpy vs. BM_copy]/8192 +322.9059 +323.0096 664 215068 664 215053 +``` +This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one. +As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`. + +### Note: Interpreting the output + +Performance measurements are an art, and performance comparisons are doubly so. +Results are often noisy and don't necessarily have large absolute differences to +them, so just by visual inspection, it is not at all apparent if two +measurements are actually showing a performance change or not. It is even more +confusing with multiple benchmark repetitions. + +Thankfully, what we can do, is use statistical tests on the results to determine +whether the performance has statistically-significantly changed. `compare.py` +uses [Mann–Whitney U +test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), with a null +hypothesis being that there's no difference in performance. 
+ +**The below output is a summary of a benchmark comparison with statistics +provided for a multi-threaded process.** +``` +Benchmark Time CPU Time Old Time New CPU Old CPU New +----------------------------------------------------------------------------------------------------------------------------- +benchmark/threads:1/process_time/real_time_pvalue 0.0000 0.0000 U Test, Repetitions: 27 vs 27 +benchmark/threads:1/process_time/real_time_mean -0.1442 -0.1442 90 77 90 77 +benchmark/threads:1/process_time/real_time_median -0.1444 -0.1444 90 77 90 77 +benchmark/threads:1/process_time/real_time_stddev +0.3974 +0.3933 0 0 0 0 +benchmark/threads:1/process_time/real_time_cv +0.6329 +0.6280 0 0 0 0 +OVERALL_GEOMEAN -0.1442 -0.1442 0 0 0 0 +``` +-------------------------------------------- +Here's a breakdown of each row: + +**benchmark/threads:1/process_time/real_time_pvalue**: This shows the _p-value_ for +the statistical test comparing the performance of the process running with one +thread. A value of 0.0000 suggests a statistically significant difference in +performance. The comparison was conducted using the U Test (Mann-Whitney +U Test) with 27 repetitions for each case. + +**benchmark/threads:1/process_time/real_time_mean**: This shows the relative +difference in mean execution time between two different cases. The negative +value (-0.1442) implies that the new process is faster by about 14.42%. The old +time was 90 units, while the new time is 77 units. + +**benchmark/threads:1/process_time/real_time_median**: Similarly, this shows the +relative difference in the median execution time. Again, the new process is +faster by 14.44%. + +**benchmark/threads:1/process_time/real_time_stddev**: This is the relative +difference in the standard deviation of the execution time, which is a measure +of how much variation or dispersion there is from the mean. A positive value +(+0.3974) implies there is more variance in the execution time in the new +process. 
+ +**benchmark/threads:1/process_time/real_time_cv**: CV stands for Coefficient of +Variation. It is the ratio of the standard deviation to the mean. It provides a +standardized measure of dispersion. An increase (+0.6329) indicates more +relative variability in the new process. + +**OVERALL_GEOMEAN**: Geomean stands for geometric mean, a type of average that is +less influenced by outliers. The negative value indicates a general improvement +in the new process. However, given the values are all zero for the old and new +times, this seems to be a mistake or placeholder in the output. + +----------------------------------------- + + + +Let's first try to see what the different columns represent in the above +`compare.py` benchmarking output: + + 1. **Benchmark:** The name of the function being benchmarked, along with the + size of the input (after the slash). + + 2. **Time:** The average time per operation, across all iterations. + + 3. **CPU:** The average CPU time per operation, across all iterations. + + 4. **Iterations:** The number of iterations the benchmark was run to get a + stable estimate. + + 5. **Time Old and Time New:** These represent the average time it takes for a + function to run in two different scenarios or versions. For example, you + might be comparing how fast a function runs before and after you make some + changes to it. + + 6. **CPU Old and CPU New:** These show the average amount of CPU time that the + function uses in two different scenarios or versions. This is similar to + Time Old and Time New, but focuses on CPU usage instead of overall time. + +In the comparison section, the relative differences in both time and CPU time +are displayed for each input size. + + +A statistically-significant difference is determined by a **p-value**, which is +a measure of the probability that the observed difference could have occurred +just by random chance. A smaller p-value indicates stronger evidence against the +null hypothesis. 
+ +**Therefore:** + 1. If the p-value is less than the chosen significance level (alpha), we + reject the null hypothesis and conclude the benchmarks are significantly + different. + 2. If the p-value is greater than or equal to alpha, we fail to reject the + null hypothesis and treat the two benchmarks as similar. + + + +The result of said the statistical test is additionally communicated through color coding: +```diff ++ Green: +``` + The benchmarks are _**statistically different**_. This could mean the + performance has either **significantly improved** or **significantly + deteriorated**. You should look at the actual performance numbers to see which + is the case. +```diff +- Red: +``` + The benchmarks are _**statistically similar**_. This means the performance + **hasn't significantly changed**. + +In statistical terms, **'green'** means we reject the null hypothesis that +there's no difference in performance, and **'red'** means we fail to reject the +null hypothesis. This might seem counter-intuitive if you're expecting 'green' +to mean 'improved performance' and 'red' to mean 'worsened performance'. +```bash + But remember, in this context: + + 'Success' means 'successfully finding a difference'. + 'Failure' means 'failing to find a difference'. +``` + + +Also, please note that **even if** we determine that there **is** a +statistically-significant difference between the two measurements, it does not +_necessarily_ mean that the actual benchmarks that were measured **are** +different, or vice versa, even if we determine that there is **no** +statistically-significant difference between the two measurements, it does not +necessarily mean that the actual benchmarks that were measured **are not** +different. 
+ + +### U test + +If there is a sufficient repetition count of the benchmarks, the tool can do +a [U Test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), of the +null hypothesis that it is equally likely that a randomly selected value from +one sample will be less than or greater than a randomly selected value from a +second sample. + +If the calculated p-value is lower than the significance +level alpha, then the result is said to be statistically significant and the +null hypothesis is rejected. Which in other words means that the two benchmarks +aren't identical. + +**WARNING**: requires **LARGE** (no less than 9) number of repetitions to be +meaningful! diff --git a/ThirdParty/googlebenchmark/docs/user_guide.md b/ThirdParty/googlebenchmark/docs/user_guide.md new file mode 100644 index 0000000000..2ceb13eb59 --- /dev/null +++ b/ThirdParty/googlebenchmark/docs/user_guide.md @@ -0,0 +1,1266 @@ +# User Guide + +## Command Line + +[Output Formats](#output-formats) + +[Output Files](#output-files) + +[Running Benchmarks](#running-benchmarks) + +[Running a Subset of Benchmarks](#running-a-subset-of-benchmarks) + +[Result Comparison](#result-comparison) + +[Extra Context](#extra-context) + +## Library + +[Runtime and Reporting Considerations](#runtime-and-reporting-considerations) + +[Setup/Teardown](#setupteardown) + +[Passing Arguments](#passing-arguments) + +[Custom Benchmark Name](#custom-benchmark-name) + +[Calculating Asymptotic Complexity](#asymptotic-complexity) + +[Templated Benchmarks](#templated-benchmarks) + +[Fixtures](#fixtures) + +[Custom Counters](#custom-counters) + +[Multithreaded Benchmarks](#multithreaded-benchmarks) + +[CPU Timers](#cpu-timers) + +[Manual Timing](#manual-timing) + +[Setting the Time Unit](#setting-the-time-unit) + +[Random Interleaving](random_interleaving.md) + +[User-Requested Performance Counters](perf_counters.md) + +[Preventing Optimization](#preventing-optimization) + +[Reporting 
Statistics](#reporting-statistics) + +[Custom Statistics](#custom-statistics) + +[Memory Usage](#memory-usage) + +[Using RegisterBenchmark](#using-register-benchmark) + +[Exiting with an Error](#exiting-with-an-error) + +[A Faster `KeepRunning` Loop](#a-faster-keep-running-loop) + +## Benchmarking Tips + +[Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling) + +[Reducing Variance in Benchmarks](reducing_variance.md) + + + +## Output Formats + +The library supports multiple output formats. Use the +`--benchmark_format=` flag (or set the +`BENCHMARK_FORMAT=` environment variable) to set +the format type. `console` is the default format. + +The Console format is intended to be a human readable format. By default +the format generates color output. Context is output on stderr and the +tabular data on stdout. Example tabular output looks like: + +``` +Benchmark Time(ns) CPU(ns) Iterations +---------------------------------------------------------------------- +BM_SetInsert/1024/1 28928 29349 23853 133.097kB/s 33.2742k items/s +BM_SetInsert/1024/8 32065 32913 21375 949.487kB/s 237.372k items/s +BM_SetInsert/1024/10 33157 33648 21431 1.13369MB/s 290.225k items/s +``` + +The JSON format outputs human readable json split into two top level attributes. +The `context` attribute contains information about the run in general, including +information about the CPU and the date. +The `benchmarks` attribute contains a list of every benchmark run. 
Example json +output looks like: + +```json +{ + "context": { + "date": "2015/03/17-18:40:25", + "num_cpus": 40, + "mhz_per_cpu": 2801, + "cpu_scaling_enabled": false, + "build_type": "debug" + }, + "benchmarks": [ + { + "name": "BM_SetInsert/1024/1", + "iterations": 94877, + "real_time": 29275, + "cpu_time": 29836, + "bytes_per_second": 134066, + "items_per_second": 33516 + }, + { + "name": "BM_SetInsert/1024/8", + "iterations": 21609, + "real_time": 32317, + "cpu_time": 32429, + "bytes_per_second": 986770, + "items_per_second": 246693 + }, + { + "name": "BM_SetInsert/1024/10", + "iterations": 21393, + "real_time": 32724, + "cpu_time": 33355, + "bytes_per_second": 1199226, + "items_per_second": 299807 + } + ] +} +``` + +The CSV format outputs comma-separated values. The `context` is output on stderr +and the CSV itself on stdout. Example CSV output looks like: + +``` +name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label +"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942, +"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115, +"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06, +``` + + + +## Output Files + +Write benchmark results to a file with the `--benchmark_out=` option +(or set `BENCHMARK_OUT`). Specify the output format with +`--benchmark_out_format={json|console|csv}` (or set +`BENCHMARK_OUT_FORMAT={json|console|csv}`). Note that the 'csv' reporter is +deprecated and the saved `.csv` file +[is not parsable](https://github.com/google/benchmark/issues/794) by csv +parsers. + +Specifying `--benchmark_out` does not suppress the console output. + + + +## Running Benchmarks + +Benchmarks are executed by running the produced binaries. Benchmarks binaries, +by default, accept options that may be specified either through their command +line interface or by setting environment variables before execution. 
For every +`--option_flag=` CLI switch, a corresponding environment variable +`OPTION_FLAG=` exists and is used as default if set (CLI switches always + prevail). A complete list of CLI options is available by running benchmarks + with the `--help` switch. + + + +## Running a Subset of Benchmarks + +The `--benchmark_filter=` option (or `BENCHMARK_FILTER=` +environment variable) can be used to only run the benchmarks that match +the specified ``. For example: + +```bash +$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32 +Run on (1 X 2300 MHz CPU ) +2016-06-25 19:34:24 +Benchmark Time CPU Iterations +---------------------------------------------------- +BM_memcpy/32 11 ns 11 ns 79545455 +BM_memcpy/32k 2181 ns 2185 ns 324074 +BM_memcpy/32 12 ns 12 ns 54687500 +BM_memcpy/32k 1834 ns 1837 ns 357143 +``` + +## Disabling Benchmarks + +It is possible to temporarily disable benchmarks by renaming the benchmark +function to have the prefix "DISABLED_". This will cause the benchmark to +be skipped at runtime. + + + +## Result comparison + +It is possible to compare the benchmarking results. +See [Additional Tooling Documentation](tools.md) + + + +## Extra Context + +Sometimes it's useful to add extra context to the content printed before the +results. By default this section includes information about the CPU on which +the benchmarks are running. If you do want to add more context, you can use +the `benchmark_context` command line flag: + +```bash +$ ./run_benchmarks --benchmark_context=pwd=`pwd` +Run on (1 x 2300 MHz CPU) +pwd: /home/user/benchmark/ +Benchmark Time CPU Iterations +---------------------------------------------------- +BM_memcpy/32 11 ns 11 ns 79545455 +BM_memcpy/32k 2181 ns 2185 ns 324074 +``` + +You can get the same effect with the API: + +```c++ + benchmark::AddCustomContext("foo", "bar"); +``` + +Note that attempts to add a second value with the same key will fail with an +error message. 
+ + + +## Runtime and Reporting Considerations + +When the benchmark binary is executed, each benchmark function is run serially. +The number of iterations to run is determined dynamically by running the +benchmark a few times and measuring the time taken and ensuring that the +ultimate result will be statistically stable. As such, faster benchmark +functions will be run for more iterations than slower benchmark functions, and +the number of iterations is thus reported. + +In all cases, the number of iterations for which the benchmark is run is +governed by the amount of time the benchmark takes. Concretely, the number of +iterations is at least one, not more than 1e9, until CPU time is greater than +the minimum time, or the wallclock time is 5x minimum time. The minimum time is +set per benchmark by calling `MinTime` on the registered benchmark object. + +Furthermore warming up a benchmark might be necessary in order to get +stable results because of e.g caching effects of the code under benchmark. +Warming up means running the benchmark a given amount of time, before +results are actually taken into account. The amount of time for which +the warmup should be run can be set per benchmark by calling +`MinWarmUpTime` on the registered benchmark object or for all benchmarks +using the `--benchmark_min_warmup_time` command-line option. Note that +`MinWarmUpTime` will overwrite the value of `--benchmark_min_warmup_time` +for the single benchmark. How many iterations the warmup run of each +benchmark takes is determined the same way as described in the paragraph +above. Per default the warmup phase is set to 0 seconds and is therefore +disabled. + +Average timings are then reported over the iterations run. If multiple +repetitions are requested using the `--benchmark_repetitions` command-line +option, or at registration time, the benchmark function will be run several +times and statistical results across these repetitions will also be reported. 
+ +As well as the per-benchmark entries, a preamble in the report will include +information about the machine on which the benchmarks are run. + + + +## Setup/Teardown + +Global setup/teardown specific to each benchmark can be done by +passing a callback to Setup/Teardown: + +The setup/teardown callbacks will be invoked once for each benchmark. If the +benchmark is multi-threaded (will run in k threads), they will be invoked +exactly once before each run with k threads. + +If the benchmark uses different size groups of threads, the above will be true +for each size group. + +Eg., + +```c++ +static void DoSetup(const benchmark::State& state) { +} + +static void DoTeardown(const benchmark::State& state) { +} + +static void BM_func(benchmark::State& state) {...} + +BENCHMARK(BM_func)->Arg(1)->Arg(3)->Threads(16)->Threads(32)->Setup(DoSetup)->Teardown(DoTeardown); + +``` + +In this example, `DoSetup` and `DoTearDown` will be invoked 4 times each, +specifically, once for each of this family: + - BM_func_Arg_1_Threads_16, BM_func_Arg_1_Threads_32 + - BM_func_Arg_3_Threads_16, BM_func_Arg_3_Threads_32 + + + +## Passing Arguments + +Sometimes a family of benchmarks can be implemented with just one routine that +takes an extra argument to specify which one of the family of benchmarks to +run. For example, the following code defines a family of benchmarks for +measuring the speed of `memcpy()` calls of different lengths: + +```c++ +static void BM_memcpy(benchmark::State& state) { + char* src = new char[state.range(0)]; + char* dst = new char[state.range(0)]; + memset(src, 'x', state.range(0)); + for (auto _ : state) + memcpy(dst, src, state.range(0)); + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(state.range(0))); + delete[] src; + delete[] dst; +} +BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(4<<10)->Arg(8<<10); +``` + +The preceding code is quite repetitive, and can be replaced with the following +short-hand. 
The following invocation will pick a few appropriate arguments in +the specified range and will generate a benchmark for each such argument. + +```c++ +BENCHMARK(BM_memcpy)->Range(8, 8<<10); +``` + +By default the arguments in the range are generated in multiples of eight and +the command above selects [ 8, 64, 512, 4k, 8k ]. In the following code the +range multiplier is changed to multiples of two. + +```c++ +BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10); +``` + +Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ]. + +The preceding code shows a method of defining a sparse range. The following +example shows a method of defining a dense range. It is then used to benchmark +the performance of `std::vector` initialization for uniformly increasing sizes. + +```c++ +static void BM_DenseRange(benchmark::State& state) { + for(auto _ : state) { + std::vector v(state.range(0), state.range(0)); + auto data = v.data(); + benchmark::DoNotOptimize(data); + benchmark::ClobberMemory(); + } +} +BENCHMARK(BM_DenseRange)->DenseRange(0, 1024, 128); +``` + +Now arguments generated are [ 0, 128, 256, 384, 512, 640, 768, 896, 1024 ]. + +You might have a benchmark that depends on two or more inputs. For example, the +following code defines a family of benchmarks for measuring the speed of set +insertion. + +```c++ +static void BM_SetInsert(benchmark::State& state) { + std::set data; + for (auto _ : state) { + state.PauseTiming(); + data = ConstructRandomSet(state.range(0)); + state.ResumeTiming(); + for (int j = 0; j < state.range(1); ++j) + data.insert(RandomNumber()); + } +} +BENCHMARK(BM_SetInsert) + ->Args({1<<10, 128}) + ->Args({2<<10, 128}) + ->Args({4<<10, 128}) + ->Args({8<<10, 128}) + ->Args({1<<10, 512}) + ->Args({2<<10, 512}) + ->Args({4<<10, 512}) + ->Args({8<<10, 512}); +``` + +The preceding code is quite repetitive, and can be replaced with the following +short-hand. 
The following macro will pick a few appropriate arguments in the +product of the two specified ranges and will generate a benchmark for each such +pair. + + +```c++ +BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}}); +``` + + +Some benchmarks may require specific argument values that cannot be expressed +with `Ranges`. In this case, `ArgsProduct` offers the ability to generate a +benchmark input for each combination in the product of the supplied vectors. + + +```c++ +BENCHMARK(BM_SetInsert) + ->ArgsProduct({{1<<10, 3<<10, 8<<10}, {20, 40, 60, 80}}) +// would generate the same benchmark arguments as +BENCHMARK(BM_SetInsert) + ->Args({1<<10, 20}) + ->Args({3<<10, 20}) + ->Args({8<<10, 20}) + ->Args({3<<10, 40}) + ->Args({8<<10, 40}) + ->Args({1<<10, 40}) + ->Args({1<<10, 60}) + ->Args({3<<10, 60}) + ->Args({8<<10, 60}) + ->Args({1<<10, 80}) + ->Args({3<<10, 80}) + ->Args({8<<10, 80}); +``` + + +For the most common scenarios, helper methods for creating a list of +integers for a given sparse or dense range are provided. + +```c++ +BENCHMARK(BM_SetInsert) + ->ArgsProduct({ + benchmark::CreateRange(8, 128, /*multi=*/2), + benchmark::CreateDenseRange(1, 4, /*step=*/1) + }) +// would generate the same benchmark arguments as +BENCHMARK(BM_SetInsert) + ->ArgsProduct({ + {8, 16, 32, 64, 128}, + {1, 2, 3, 4} + }); +``` + +For more complex patterns of inputs, passing a custom function to `Apply` allows +programmatic specification of an arbitrary set of arguments on which to run the +benchmark. The following example enumerates a dense range on one parameter, +and a sparse range on the second. + +```c++ +static void CustomArguments(benchmark::internal::Benchmark* b) { + for (int i = 0; i <= 10; ++i) + for (int j = 32; j <= 1024*1024; j *= 8) + b->Args({i, j}); +} +BENCHMARK(BM_SetInsert)->Apply(CustomArguments); +``` + +### Passing Arbitrary Arguments to a Benchmark + +In C++11 it is possible to define a benchmark that takes an arbitrary number +of extra arguments. 
The `BENCHMARK_CAPTURE(func, test_case_name, ...args)` +macro creates a benchmark that invokes `func` with the `benchmark::State` as +the first argument followed by the specified `args...`. +The `test_case_name` is appended to the name of the benchmark and +should describe the values passed. + +```c++ +template +void BM_takes_args(benchmark::State& state, Args&&... args) { + auto args_tuple = std::make_tuple(std::move(args)...); + for (auto _ : state) { + std::cout << std::get<0>(args_tuple) << ": " << std::get<1>(args_tuple) + << '\n'; + [...] + } +} +// Registers a benchmark named "BM_takes_args/int_string_test" that passes +// the specified values to `args`. +BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc")); + +// Registers the same benchmark "BM_takes_args/int_test" that passes +// the specified values to `args`. +BENCHMARK_CAPTURE(BM_takes_args, int_test, 42, 43); +``` + +Note that elements of `...args` may refer to global variables. Users should +avoid modifying global state inside of a benchmark. + + + +## Calculating Asymptotic Complexity (Big O) + +Asymptotic complexity might be calculated for a family of benchmarks. The +following code will calculate the coefficient for the high-order term in the +running time and the normalized root-mean square error of string comparison. + +```c++ +static void BM_StringCompare(benchmark::State& state) { + std::string s1(state.range(0), '-'); + std::string s2(state.range(0), '-'); + for (auto _ : state) { + auto comparison_result = s1.compare(s2); + benchmark::DoNotOptimize(comparison_result); + } + state.SetComplexityN(state.range(0)); +} +BENCHMARK(BM_StringCompare) + ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN); +``` + +As shown in the following invocation, asymptotic complexity might also be +calculated automatically. 
+ +```c++ +BENCHMARK(BM_StringCompare) + ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(); +``` + +The following code will specify asymptotic complexity with a lambda function, +that might be used to customize high-order term calculation. + +```c++ +BENCHMARK(BM_StringCompare)->RangeMultiplier(2) + ->Range(1<<10, 1<<18)->Complexity([](benchmark::IterationCount n)->double{return n; }); +``` + + + +## Custom Benchmark Name + +You can change the benchmark's name as follows: + +```c++ +BENCHMARK(BM_memcpy)->Name("memcpy")->RangeMultiplier(2)->Range(8, 8<<10); +``` + +The invocation will execute the benchmark as before using `BM_memcpy` but changes +the prefix in the report to `memcpy`. + + + +## Templated Benchmarks + +This example produces and consumes messages of size `sizeof(v)` `range_x` +times. It also outputs throughput in the absence of multiprogramming. + +```c++ +template void BM_Sequential(benchmark::State& state) { + Q q; + typename Q::value_type v; + for (auto _ : state) { + for (int i = state.range(0); i--; ) + q.push(v); + for (int e = state.range(0); e--; ) + q.Wait(&v); + } + // actually messages, not bytes: + state.SetBytesProcessed( + static_cast(state.iterations())*state.range(0)); +} +// C++03 +BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue)->Range(1<<0, 1<<10); + +// C++11 or newer, you can use the BENCHMARK macro with template parameters: +BENCHMARK(BM_Sequential>)->Range(1<<0, 1<<10); + +``` + +Three macros are provided for adding benchmark templates. + +```c++ +#ifdef BENCHMARK_HAS_CXX11 +#define BENCHMARK(func<...>) // Takes any number of parameters. 
+#else // C++ < C++11 +#define BENCHMARK_TEMPLATE(func, arg1) +#endif +#define BENCHMARK_TEMPLATE1(func, arg1) +#define BENCHMARK_TEMPLATE2(func, arg1, arg2) +``` + + + +## Fixtures + +Fixture tests are created by first defining a type that derives from +`::benchmark::Fixture` and then creating/registering the tests using the +following macros: + +* `BENCHMARK_F(ClassName, Method)` +* `BENCHMARK_DEFINE_F(ClassName, Method)` +* `BENCHMARK_REGISTER_F(ClassName, Method)` + +For Example: + +```c++ +class MyFixture : public benchmark::Fixture { +public: + void SetUp(const ::benchmark::State& state) { + } + + void TearDown(const ::benchmark::State& state) { + } +}; + +BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) { + for (auto _ : st) { + ... + } +} + +BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) { + for (auto _ : st) { + ... + } +} +/* BarTest is NOT registered */ +BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2); +/* BarTest is now registered */ +``` + +### Templated Fixtures + +Also you can create templated fixture by using the following macros: + +* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)` +* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)` + +For example: + +```c++ +template +class MyFixture : public benchmark::Fixture {}; + +BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) { + for (auto _ : st) { + ... + } +} + +BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) { + for (auto _ : st) { + ... + } +} + +BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2); +``` + + + +## Custom Counters + +You can add your own counters with user-defined names. The example below +will add columns "Foo", "Bar" and "Baz" in its output: + +```c++ +static void UserCountersExample1(benchmark::State& state) { + double numFoos = 0, numBars = 0, numBazs = 0; + for (auto _ : state) { + // ... 
count Foo,Bar,Baz events + } + state.counters["Foo"] = numFoos; + state.counters["Bar"] = numBars; + state.counters["Baz"] = numBazs; +} +``` + +The `state.counters` object is a `std::map` with `std::string` keys +and `Counter` values. The latter is a `double`-like class, via an implicit +conversion to `double&`. Thus you can use all of the standard arithmetic +assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter. + +In multithreaded benchmarks, each counter is set on the calling thread only. +When the benchmark finishes, the counters from each thread will be summed; +the resulting sum is the value which will be shown for the benchmark. + +The `Counter` constructor accepts three parameters: the value as a `double` +; a bit flag which allows you to show counters as rates, and/or as per-thread +iteration, and/or as per-thread averages, and/or iteration invariants, +and/or finally inverting the result; and a flag specifying the 'unit' - i.e. +is 1k a 1000 (default, `benchmark::Counter::OneK::kIs1000`), or 1024 +(`benchmark::Counter::OneK::kIs1024`)? + +```c++ + // sets a simple counter + state.counters["Foo"] = numFoos; + + // Set the counter as a rate. It will be presented divided + // by the duration of the benchmark. + // Meaning: per one second, how many 'foo's are processed? + state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate); + + // Set the counter as a rate. It will be presented divided + // by the duration of the benchmark, and the result inverted. + // Meaning: how many seconds it takes to process one 'foo'? + state.counters["FooInvRate"] = Counter(numFoos, benchmark::Counter::kIsRate | benchmark::Counter::kInvert); + + // Set the counter as a thread-average quantity. It will + // be presented divided by the number of threads. 
+ state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads); + + // There's also a combined flag: + state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate); + + // This says that we process with the rate of state.range(0) bytes every iteration: + state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024); +``` + +When you're compiling in C++11 mode or later you can use `insert()` with +`std::initializer_list`: + + +```c++ + // With C++11, this can be done: + state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}}); + // ... instead of: + state.counters["Foo"] = numFoos; + state.counters["Bar"] = numBars; + state.counters["Baz"] = numBazs; +``` + + +### Counter Reporting + +When using the console reporter, by default, user counters are printed at +the end after the table, the same way as ``bytes_processed`` and +``items_processed``. This is best for cases in which there are few counters, +or where there are only a couple of lines per benchmark. Here's an example of +the default output: + +``` +------------------------------------------------------------------------------ +Benchmark Time CPU Iterations UserCounters... 
+------------------------------------------------------------------------------ +BM_UserCounter/threads:8 2248 ns 10277 ns 68808 Bar=16 Bat=40 Baz=24 Foo=8 +BM_UserCounter/threads:1 9797 ns 9788 ns 71523 Bar=2 Bat=5 Baz=3 Foo=1024m +BM_UserCounter/threads:2 4924 ns 9842 ns 71036 Bar=4 Bat=10 Baz=6 Foo=2 +BM_UserCounter/threads:4 2589 ns 10284 ns 68012 Bar=8 Bat=20 Baz=12 Foo=4 +BM_UserCounter/threads:8 2212 ns 10287 ns 68040 Bar=16 Bat=40 Baz=24 Foo=8 +BM_UserCounter/threads:16 1782 ns 10278 ns 68144 Bar=32 Bat=80 Baz=48 Foo=16 +BM_UserCounter/threads:32 1291 ns 10296 ns 68256 Bar=64 Bat=160 Baz=96 Foo=32 +BM_UserCounter/threads:4 2615 ns 10307 ns 68040 Bar=8 Bat=20 Baz=12 Foo=4 +BM_Factorial 26 ns 26 ns 26608979 40320 +BM_Factorial/real_time 26 ns 26 ns 26587936 40320 +BM_CalculatePiRange/1 16 ns 16 ns 45704255 0 +BM_CalculatePiRange/8 73 ns 73 ns 9520927 3.28374 +BM_CalculatePiRange/64 609 ns 609 ns 1140647 3.15746 +BM_CalculatePiRange/512 4900 ns 4901 ns 142696 3.14355 +``` + +If this doesn't suit you, you can print each counter as a table column by +passing the flag `--benchmark_counters_tabular=true` to the benchmark +application. This is best for cases in which there are a lot of counters, or +a lot of lines per individual benchmark. Note that this will trigger a +reprinting of the table header any time the counter set changes between +individual benchmarks. 
Here's an example of corresponding output when +`--benchmark_counters_tabular=true` is passed: + +``` +--------------------------------------------------------------------------------------- +Benchmark Time CPU Iterations Bar Bat Baz Foo +--------------------------------------------------------------------------------------- +BM_UserCounter/threads:8 2198 ns 9953 ns 70688 16 40 24 8 +BM_UserCounter/threads:1 9504 ns 9504 ns 73787 2 5 3 1 +BM_UserCounter/threads:2 4775 ns 9550 ns 72606 4 10 6 2 +BM_UserCounter/threads:4 2508 ns 9951 ns 70332 8 20 12 4 +BM_UserCounter/threads:8 2055 ns 9933 ns 70344 16 40 24 8 +BM_UserCounter/threads:16 1610 ns 9946 ns 70720 32 80 48 16 +BM_UserCounter/threads:32 1192 ns 9948 ns 70496 64 160 96 32 +BM_UserCounter/threads:4 2506 ns 9949 ns 70332 8 20 12 4 +-------------------------------------------------------------- +Benchmark Time CPU Iterations +-------------------------------------------------------------- +BM_Factorial 26 ns 26 ns 26392245 40320 +BM_Factorial/real_time 26 ns 26 ns 26494107 40320 +BM_CalculatePiRange/1 15 ns 15 ns 45571597 0 +BM_CalculatePiRange/8 74 ns 74 ns 9450212 3.28374 +BM_CalculatePiRange/64 595 ns 595 ns 1173901 3.15746 +BM_CalculatePiRange/512 4752 ns 4752 ns 147380 3.14355 +BM_CalculatePiRange/4k 37970 ns 37972 ns 18453 3.14184 +BM_CalculatePiRange/32k 303733 ns 303744 ns 2305 3.14162 +BM_CalculatePiRange/256k 2434095 ns 2434186 ns 288 3.1416 +BM_CalculatePiRange/1024k 9721140 ns 9721413 ns 71 3.14159 +BM_CalculatePi/threads:8 2255 ns 9943 ns 70936 +``` + +Note above the additional header printed when the benchmark changes from +``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does +not have the same counter set as ``BM_UserCounter``. 
+ + + +## Multithreaded Benchmarks + +In a multithreaded test (benchmark invoked by multiple threads simultaneously), +it is guaranteed that none of the threads will start until all have reached +the start of the benchmark loop, and all will have finished before any thread +exits the benchmark loop. (This behavior is also provided by the `KeepRunning()` +API) As such, any global setup or teardown can be wrapped in a check against the thread +index: + +```c++ +static void BM_MultiThreaded(benchmark::State& state) { + if (state.thread_index() == 0) { + // Setup code here. + } + for (auto _ : state) { + // Run the test as normal. + } + if (state.thread_index() == 0) { + // Teardown code here. + } +} +BENCHMARK(BM_MultiThreaded)->Threads(2); +``` + +To run the benchmark across a range of thread counts, instead of `Threads`, use +`ThreadRange`. This takes two parameters (`min_threads` and `max_threads`) and +runs the benchmark once for values in the inclusive range. For example: + +```c++ +BENCHMARK(BM_MultiThreaded)->ThreadRange(1, 8); +``` + +will run `BM_MultiThreaded` with thread counts 1, 2, 4, and 8. + +If the benchmarked code itself uses threads and you want to compare it to +single-threaded code, you may want to use real-time ("wallclock") measurements +for latency comparisons: + +```c++ +BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime(); +``` + +Without `UseRealTime`, CPU time is used by default. + + + +## CPU Timers + +By default, the CPU timer only measures the time spent by the main thread. +If the benchmark itself uses threads internally, this measurement may not +be what you are looking for. Instead, there is a way to measure the total +CPU usage of the process, by all the threads. 
+ +```c++ +void callee(int i); + +static void MyMain(int size) { +#pragma omp parallel for + for(int i = 0; i < size; i++) + callee(i); +} + +static void BM_OpenMP(benchmark::State& state) { + for (auto _ : state) + MyMain(state.range(0)); +} + +// Measure the time spent by the main thread, use it to decide for how long to +// run the benchmark loop. Depending on the internal implementation detail may +// measure to anywhere from near-zero (the overhead spent before/after work +// handoff to worker thread[s]) to the whole single-thread time. +BENCHMARK(BM_OpenMP)->Range(8, 8<<10); + +// Measure the user-visible time, the wall clock (literally, the time that +// has passed on the clock on the wall), use it to decide for how long to +// run the benchmark loop. This will always be meaningful, and will match the +// time spent by the main thread in single-threaded case, in general decreasing +// with the number of internal threads doing the work. +BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime(); + +// Measure the total CPU consumption, use it to decide for how long to +// run the benchmark loop. This will always measure to no less than the +// time spent by the main thread in single-threaded case. +BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime(); + +// A mixture of the last two. Measure the total CPU consumption, but use the +// wall clock to decide for how long to run the benchmark loop. +BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime(); +``` + +### Controlling Timers + +Normally, the entire duration of the work loop (`for (auto _ : state) {}`) +is measured. But sometimes, it is necessary to do some work inside of +that loop, every iteration, but without counting that time to the benchmark time. +That is possible, although it is not recommended, since it has high overhead. 
+ + +```c++ +static void BM_SetInsert_With_Timer_Control(benchmark::State& state) { + std::set data; + for (auto _ : state) { + state.PauseTiming(); // Stop timers. They will not count until they are resumed. + data = ConstructRandomSet(state.range(0)); // Do something that should not be measured + state.ResumeTiming(); // And resume timers. They are now counting again. + // The rest will be measured. + for (int j = 0; j < state.range(1); ++j) + data.insert(RandomNumber()); + } +} +BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}}); +``` + + + + +## Manual Timing + +For benchmarking something for which neither CPU time nor real-time are +correct or accurate enough, completely manual timing is supported using +the `UseManualTime` function. + +When `UseManualTime` is used, the benchmarked code must call +`SetIterationTime` once per iteration of the benchmark loop to +report the manually measured time. + +An example use case for this is benchmarking GPU execution (e.g. OpenCL +or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot +be accurately measured using CPU time or real-time. Instead, they can be +measured accurately using a dedicated API, and these measurement results +can be reported back with `SetIterationTime`. 
+ +```c++ +static void BM_ManualTiming(benchmark::State& state) { + int microseconds = state.range(0); + std::chrono::duration sleep_duration { + static_cast(microseconds) + }; + + for (auto _ : state) { + auto start = std::chrono::high_resolution_clock::now(); + // Simulate some useful workload with a sleep + std::this_thread::sleep_for(sleep_duration); + auto end = std::chrono::high_resolution_clock::now(); + + auto elapsed_seconds = + std::chrono::duration_cast>( + end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } +} +BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime(); +``` + + + +## Setting the Time Unit + +If a benchmark runs a few milliseconds it may be hard to visually compare the +measured times, since the output data is given in nanoseconds per default. In +order to manually set the time unit, you can specify it manually: + +```c++ +BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); +``` + +Additionally the default time unit can be set globally with the +`--benchmark_time_unit={ns|us|ms|s}` command line argument. The argument only +affects benchmarks where the time unit is not set explicitly. + + + +## Preventing Optimization + +To prevent a value or expression from being optimized away by the compiler +the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()` +functions can be used. + +```c++ +static void BM_test(benchmark::State& state) { + for (auto _ : state) { + int x = 0; + for (int i=0; i < 64; ++i) { + benchmark::DoNotOptimize(x += i); + } + } +} +``` + +`DoNotOptimize()` forces the *result* of `` to be stored in either +memory or a register. For GNU based compilers it acts as read/write barrier +for global memory. More specifically it forces the compiler to flush pending +writes to memory and reload any other values as necessary. + +Note that `DoNotOptimize()` does not prevent optimizations on `` +in any way. `` may even be removed entirely when the result is already +known. 
For example: + +```c++ + /* Example 1: `` is removed entirely. */ + int foo(int x) { return x + 42; } + while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42); + + /* Example 2: Result of '' is only reused */ + int bar(int) __attribute__((const)); + while (...) DoNotOptimize(bar(0)); // Optimized to: + // int __result__ = bar(0); + // while (...) DoNotOptimize(__result__); +``` + +The second tool for preventing optimizations is `ClobberMemory()`. In essence +`ClobberMemory()` forces the compiler to perform all pending writes to global +memory. Memory managed by block scope objects must be "escaped" using +`DoNotOptimize(...)` before it can be clobbered. In the below example +`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized +away. + +```c++ +static void BM_vector_push_back(benchmark::State& state) { + for (auto _ : state) { + std::vector v; + v.reserve(1); + auto data = v.data(); // Allow v.data() to be clobbered. Pass as non-const + benchmark::DoNotOptimize(data); // lvalue to avoid undesired compiler optimizations + v.push_back(42); + benchmark::ClobberMemory(); // Force 42 to be written to memory. + } +} +``` + +Note that `ClobberMemory()` is only available for GNU or MSVC based compilers. + + + +## Statistics: Reporting the Mean, Median and Standard Deviation / Coefficient of variation of Repeated Benchmarks + +By default each benchmark is run once and that single result is reported. +However benchmarks are often noisy and a single result may not be representative +of the overall behavior. For this reason it's possible to repeatedly rerun the +benchmark. + +The number of runs of each benchmark is specified globally by the +`--benchmark_repetitions` flag or on a per benchmark basis by calling +`Repetitions` on the registered benchmark object. When a benchmark is run more +than once the mean, median, standard deviation and coefficient of variation +of the runs will be reported. 
+ +Additionally the `--benchmark_report_aggregates_only={true|false}`, +`--benchmark_display_aggregates_only={true|false}` flags or +`ReportAggregatesOnly(bool)`, `DisplayAggregatesOnly(bool)` functions can be +used to change how repeated tests are reported. By default the result of each +repeated run is reported. When `report aggregates only` option is `true`, +only the aggregates (i.e. mean, median, standard deviation and coefficient +of variation, maybe complexity measurements if they were requested) of the runs +is reported, to both the reporters - standard output (console), and the file. +However when only the `display aggregates only` option is `true`, +only the aggregates are displayed in the standard output, while the file +output still contains everything. +Calling `ReportAggregatesOnly(bool)` / `DisplayAggregatesOnly(bool)` on a +registered benchmark object overrides the value of the appropriate flag for that +benchmark. + + + +## Custom Statistics + +While having these aggregates is nice, this may not be enough for everyone. +For example you may want to know what the largest observation is, e.g. because +you have some real-time constraints. This is easy. The following code will +specify a custom statistic to be calculated, defined by a lambda function. 
+ +```c++ +void BM_spin_empty(benchmark::State& state) { + for (auto _ : state) { + for (int x = 0; x < state.range(0); ++x) { + benchmark::DoNotOptimize(x); + } + } +} + +BENCHMARK(BM_spin_empty) + ->ComputeStatistics("max", [](const std::vector& v) -> double { + return *(std::max_element(std::begin(v), std::end(v))); + }) + ->Arg(512); +``` + +While usually the statistics produce values in time units, +you can also produce percentages: + +```c++ +void BM_spin_empty(benchmark::State& state) { + for (auto _ : state) { + for (int x = 0; x < state.range(0); ++x) { + benchmark::DoNotOptimize(x); + } + } +} + +BENCHMARK(BM_spin_empty) + ->ComputeStatistics("ratio", [](const std::vector& v) -> double { + return std::begin(v) / std::end(v); + }, benchmark::StatisticUnit::kPercentage) + ->Arg(512); +``` + + + +## Memory Usage + +It's often useful to also track memory usage for benchmarks, alongside CPU +performance. For this reason, benchmark offers the `RegisterMemoryManager` +method that allows a custom `MemoryManager` to be injected. + +If set, the `MemoryManager::Start` and `MemoryManager::Stop` methods will be +called at the start and end of benchmark runs to allow user code to fill out +a report on the number of allocations, bytes used, etc. + +This data will then be reported alongside other performance data, currently +only when using JSON output. + + + +## Using RegisterBenchmark(name, fn, args...) + +The `RegisterBenchmark(name, func, args...)` function provides an alternative +way to create and register benchmarks. +`RegisterBenchmark(name, func, args...)` creates, registers, and returns a +pointer to a new benchmark with the specified `name` that invokes +`func(st, args...)` where `st` is a `benchmark::State` object. + +Unlike the `BENCHMARK` registration macros, which can only be used at the global +scope, the `RegisterBenchmark` can be called anywhere. This allows for +benchmark tests to be registered programmatically. 
+ +Additionally `RegisterBenchmark` allows any callable object to be registered +as a benchmark. Including capturing lambdas and function objects. + +For Example: +```c++ +auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ }; + +int main(int argc, char** argv) { + for (auto& test_input : { /* ... */ }) + benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input); + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); +} +``` + + + +## Exiting with an Error + +When errors caused by external influences, such as file I/O and network +communication, occur within a benchmark the +`State::SkipWithError(const std::string& msg)` function can be used to skip that run +of benchmark and report the error. Note that only future iterations of the +`KeepRunning()` are skipped. For the ranged-for version of the benchmark loop +Users must explicitly exit the loop, otherwise all iterations will be performed. +Users may explicitly return to exit the benchmark immediately. + +The `SkipWithError(...)` function may be used at any point within the benchmark, +including before and after the benchmark loop. Moreover, if `SkipWithError(...)` +has been used, it is not required to reach the benchmark loop and one may return +from the benchmark function early. + +For example: + +```c++ +static void BM_test(benchmark::State& state) { + auto resource = GetResource(); + if (!resource.good()) { + state.SkipWithError("Resource is not good!"); + // KeepRunning() loop will not be entered. + } + while (state.KeepRunning()) { + auto data = resource.read_data(); + if (!resource.good()) { + state.SkipWithError("Failed to read data!"); + break; // Needed to skip the rest of the iteration. 
+ } + do_stuff(data); + } +} + +static void BM_test_ranged_fo(benchmark::State & state) { + auto resource = GetResource(); + if (!resource.good()) { + state.SkipWithError("Resource is not good!"); + return; // Early return is allowed when SkipWithError() has been used. + } + for (auto _ : state) { + auto data = resource.read_data(); + if (!resource.good()) { + state.SkipWithError("Failed to read data!"); + break; // REQUIRED to prevent all further iterations. + } + do_stuff(data); + } +} +``` + + +## A Faster KeepRunning Loop + +In C++11 mode, a ranged-based for loop should be used in preference to +the `KeepRunning` loop for running the benchmarks. For example: + +```c++ +static void BM_Fast(benchmark::State &state) { + for (auto _ : state) { + FastOperation(); + } +} +BENCHMARK(BM_Fast); +``` + +The reason the ranged-for loop is faster than using `KeepRunning`, is +because `KeepRunning` requires a memory load and store of the iteration count +ever iteration, whereas the ranged-for variant is able to keep the iteration count +in a register. + +For example, an empty inner loop of using the ranged-based for method looks like: + +```asm +# Loop Init + mov rbx, qword ptr [r14 + 104] + call benchmark::State::StartKeepRunning() + test rbx, rbx + je .LoopEnd +.LoopHeader: # =>This Inner Loop Header: Depth=1 + add rbx, -1 + jne .LoopHeader +.LoopEnd: +``` + +Compared to an empty `KeepRunning` loop, which looks like: + +```asm +.LoopHeader: # in Loop: Header=BB0_3 Depth=1 + cmp byte ptr [rbx], 1 + jne .LoopInit +.LoopBody: # =>This Inner Loop Header: Depth=1 + mov rax, qword ptr [rbx + 8] + lea rcx, [rax + 1] + mov qword ptr [rbx + 8], rcx + cmp rax, qword ptr [rbx + 104] + jb .LoopHeader + jmp .LoopEnd +.LoopInit: + mov rdi, rbx + call benchmark::State::StartKeepRunning() + jmp .LoopBody +.LoopEnd: +``` + +Unless C++03 compatibility is required, the ranged-for variant of writing +the benchmark loop should be preferred. 
+ + + +## Disabling CPU Frequency Scaling + +If you see this error: + +``` +***WARNING*** CPU scaling is enabled, the benchmark real time measurements may +be noisy and will incur extra overhead. +``` + +you might want to disable the CPU frequency scaling while running the +benchmark, as well as consider other ways to stabilize the performance of +your system while benchmarking. + +See [Reducing Variance](reducing_variance.md) for more information. diff --git a/ThirdParty/googlebenchmark/include/benchmark/benchmark.h b/ThirdParty/googlebenchmark/include/benchmark/benchmark.h index 6cb96f546d..e3857e717f 100644 --- a/ThirdParty/googlebenchmark/include/benchmark/benchmark.h +++ b/ThirdParty/googlebenchmark/include/benchmark/benchmark.h @@ -34,7 +34,7 @@ static void BM_StringCopy(benchmark::State& state) { BENCHMARK(BM_StringCopy); // Augment the main() program to invoke benchmarks if specified -// via the --benchmarks command line flag. E.g., +// via the --benchmark_filter command line flag. E.g., // my_unittest --benchmark_filter=all // my_unittest --benchmark_filter=BM_StringCreation // my_unittest --benchmark_filter=String @@ -42,6 +42,7 @@ BENCHMARK(BM_StringCopy); int main(int argc, char** argv) { benchmark::Initialize(&argc, argv); benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); return 0; } @@ -139,13 +140,13 @@ thread exits the loop body. As such, any global setup or teardown you want to do can be wrapped in a check against the thread index: static void BM_MultiThreaded(benchmark::State& state) { - if (state.thread_index == 0) { + if (state.thread_index() == 0) { // Setup code here. } for (auto _ : state) { // Run the test as normal. } - if (state.thread_index == 0) { + if (state.thread_index() == 0) { // Teardown code here. } } @@ -167,18 +168,29 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); #define BENCHMARK_HAS_CXX11 #endif +// This _MSC_VER check should detect VS 2017 v15.3 and newer. 
+#if __cplusplus >= 201703L || \ + (defined(_MSC_VER) && _MSC_VER >= 1911 && _MSVC_LANG >= 201703L) +#define BENCHMARK_HAS_CXX17 +#endif + #include #include #include #include #include +#include #include #include #include +#include #include +#include "benchmark/export.h" + #if defined(BENCHMARK_HAS_CXX11) +#include #include #include #include @@ -198,42 +210,63 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); TypeName& operator=(const TypeName&) = delete #endif -#if defined(__GNUC__) +#ifdef BENCHMARK_HAS_CXX17 +#define BENCHMARK_UNUSED [[maybe_unused]] +#elif defined(__GNUC__) || defined(__clang__) #define BENCHMARK_UNUSED __attribute__((unused)) -#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline)) -#define BENCHMARK_NOEXCEPT noexcept -#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) -#elif defined(_MSC_VER) && !defined(__clang__) +#else #define BENCHMARK_UNUSED -#define BENCHMARK_ALWAYS_INLINE __forceinline -#if _MSC_VER >= 1900 -#define BENCHMARK_NOEXCEPT noexcept -#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#endif + +// Used to annotate functions, methods and classes so they +// are not optimized by the compiler. 
Useful for tests +// where you expect loops to stay in place churning cycles +#if defined(__clang__) +#define BENCHMARK_DONT_OPTIMIZE __attribute__((optnone)) +#elif defined(__GNUC__) || defined(__GNUG__) +#define BENCHMARK_DONT_OPTIMIZE __attribute__((optimize(0))) #else -#define BENCHMARK_NOEXCEPT -#define BENCHMARK_NOEXCEPT_OP(x) +// MSVC & Intel do not have a no-optimize attribute, only line pragmas +#define BENCHMARK_DONT_OPTIMIZE #endif + +#if defined(__GNUC__) || defined(__clang__) +#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline)) +#elif defined(_MSC_VER) && !defined(__clang__) +#define BENCHMARK_ALWAYS_INLINE __forceinline #define __func__ __FUNCTION__ #else -#define BENCHMARK_UNUSED #define BENCHMARK_ALWAYS_INLINE -#define BENCHMARK_NOEXCEPT -#define BENCHMARK_NOEXCEPT_OP(x) #endif #define BENCHMARK_INTERNAL_TOSTRING2(x) #x #define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x) -#if defined(__GNUC__) || defined(__clang__) +// clang-format off +#if (defined(__GNUC__) && !defined(__NVCC__) && !defined(__NVCOMPILER)) || defined(__clang__) #define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y) #define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg))) +#define BENCHMARK_DISABLE_DEPRECATED_WARNING \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("GCC diagnostic pop") +#elif defined(__NVCOMPILER) +#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y) +#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg))) +#define BENCHMARK_DISABLE_DEPRECATED_WARNING \ + _Pragma("diagnostic push") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") +#define BENCHMARK_RESTORE_DEPRECATED_WARNING _Pragma("diagnostic pop") #else #define BENCHMARK_BUILTIN_EXPECT(x, y) x #define BENCHMARK_DEPRECATED_MSG(msg) #define BENCHMARK_WARNING_MSG(msg) \ __pragma(message(__FILE__ "(" 
BENCHMARK_INTERNAL_TOSTRING( \ __LINE__) ") : warning note: " msg)) +#define BENCHMARK_DISABLE_DEPRECATED_WARNING +#define BENCHMARK_RESTORE_DEPRECATED_WARNING #endif +// clang-format on #if defined(__GNUC__) && !defined(__clang__) #define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) @@ -251,21 +284,60 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond); #define BENCHMARK_UNREACHABLE() ((void)0) #endif +#ifdef BENCHMARK_HAS_CXX11 +#define BENCHMARK_OVERRIDE override +#else +#define BENCHMARK_OVERRIDE +#endif + +#if defined(_MSC_VER) +#pragma warning(push) +// C4251: needs to have dll-interface to be used by clients of class +#pragma warning(disable : 4251) +#endif + namespace benchmark { class BenchmarkReporter; -class MemoryManager; -void Initialize(int* argc, char** argv); +// Default number of minimum benchmark running time in seconds. +const char kDefaultMinTimeStr[] = "0.5s"; + +BENCHMARK_EXPORT void PrintDefaultHelp(); + +BENCHMARK_EXPORT void Initialize(int* argc, char** argv, + void (*HelperPrinterf)() = PrintDefaultHelp); +BENCHMARK_EXPORT void Shutdown(); // Report to stdout all arguments in 'argv' as unrecognized except the first. // Returns true there is at least on unrecognized argument (i.e. 'argc' > 1). -bool ReportUnrecognizedArguments(int argc, char** argv); +BENCHMARK_EXPORT bool ReportUnrecognizedArguments(int argc, char** argv); + +// Returns the current value of --benchmark_filter. +BENCHMARK_EXPORT std::string GetBenchmarkFilter(); + +// Sets a new value to --benchmark_filter. (This will override this flag's +// current value). +// Should be called after `benchmark::Initialize()`, as +// `benchmark::Initialize()` will override the flag's value. +BENCHMARK_EXPORT void SetBenchmarkFilter(std::string value); + +// Returns the current value of --v (command line value for verbosity). +BENCHMARK_EXPORT int32_t GetBenchmarkVerbosity(); + +// Creates a default display reporter. 
Used by the library when no display +// reporter is provided, but also made available for external use in case a +// custom reporter should respect the `--benchmark_format` flag as a fallback +BENCHMARK_EXPORT BenchmarkReporter* CreateDefaultDisplayReporter(); // Generate a list of benchmarks matching the specified --benchmark_filter flag // and if --benchmark_list_tests is specified return after printing the name // of each matching benchmark. Otherwise run each matching benchmark and // report the results. // +// spec : Specify the benchmarks to run. If users do not specify this arg, +// then the value of FLAGS_benchmark_filter +// will be used. +// // The second and third overload use the specified 'display_reporter' and // 'file_reporter' respectively. 'file_reporter' will write to the file // specified @@ -273,28 +345,94 @@ bool ReportUnrecognizedArguments(int argc, char** argv); // 'file_reporter' is ignored. // // RETURNS: The number of matching benchmarks. -size_t RunSpecifiedBenchmarks(); -size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter); -size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, - BenchmarkReporter* file_reporter); +BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks(); +BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks(std::string spec); + +BENCHMARK_EXPORT size_t +RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter); +BENCHMARK_EXPORT size_t +RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, std::string spec); + +BENCHMARK_EXPORT size_t RunSpecifiedBenchmarks( + BenchmarkReporter* display_reporter, BenchmarkReporter* file_reporter); +BENCHMARK_EXPORT size_t +RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, + BenchmarkReporter* file_reporter, std::string spec); + +// TimeUnit is passed to a benchmark in order to specify the order of magnitude +// for the measured time. 
+enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond, kSecond }; + +BENCHMARK_EXPORT TimeUnit GetDefaultTimeUnit(); + +// Sets the default time unit the benchmarks use +// Has to be called before the benchmark loop to take effect +BENCHMARK_EXPORT void SetDefaultTimeUnit(TimeUnit unit); + +// If a MemoryManager is registered (via RegisterMemoryManager()), +// it can be used to collect and report allocation metrics for a run of the +// benchmark. +class MemoryManager { + public: + static const int64_t TombstoneValue; + + struct Result { + Result() + : num_allocs(0), + max_bytes_used(0), + total_allocated_bytes(TombstoneValue), + net_heap_growth(TombstoneValue) {} + + // The number of allocations made in total between Start and Stop. + int64_t num_allocs; + + // The peak memory use between Start and Stop. + int64_t max_bytes_used; + + // The total memory allocated, in bytes, between Start and Stop. + // Init'ed to TombstoneValue if metric not available. + int64_t total_allocated_bytes; + + // The net changes in memory, in bytes, between Start and Stop. + // ie., total_allocated_bytes - total_deallocated_bytes. + // Init'ed to TombstoneValue if metric not available. + int64_t net_heap_growth; + }; + + virtual ~MemoryManager() {} + + // Implement this to start recording allocation information. + virtual void Start() = 0; + + // Implement this to stop recording and fill out the given Result structure. + virtual void Stop(Result& result) = 0; +}; // Register a MemoryManager instance that will be used to collect and report // allocation measurements for benchmark runs. +BENCHMARK_EXPORT void RegisterMemoryManager(MemoryManager* memory_manager); +// Add a key-value pair to output as part of the context stanza in the report. 
+BENCHMARK_EXPORT +void AddCustomContext(const std::string& key, const std::string& value); + namespace internal { class Benchmark; class BenchmarkImp; class BenchmarkFamilies; +BENCHMARK_EXPORT std::map*& GetGlobalContext(); + +BENCHMARK_EXPORT void UseCharPointer(char const volatile*); // Take ownership of the pointer and register the benchmark. Return the // registered benchmark. -Benchmark* RegisterBenchmarkInternal(Benchmark*); +BENCHMARK_EXPORT Benchmark* RegisterBenchmarkInternal(Benchmark*); // Ensure that the standard streams are properly initialized in every TU. -int InitializeStreams(); +BENCHMARK_EXPORT int InitializeStreams(); BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams(); } // namespace internal @@ -304,12 +442,24 @@ BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams(); #define BENCHMARK_HAS_NO_INLINE_ASSEMBLY #endif +// Force the compiler to flush pending writes to global memory. Acts as an +// effective read/write barrier +#ifdef BENCHMARK_HAS_CXX11 +inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { + std::atomic_signal_fence(std::memory_order_acq_rel); +} +#endif + // The DoNotOptimize(...) function can be used to prevent a value or // expression from being optimized away by the compiler. This function is // intended to add little to no overhead. // See: https://youtu.be/nXaxk27zwlk?t=2441 #ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY +#if !defined(__GNUC__) || defined(__llvm__) || defined(__INTEL_COMPILER) template +BENCHMARK_DEPRECATED_MSG( + "The const-ref version of this method can permit " + "undesired compiler optimizations in benchmarks") inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { asm volatile("" : : "r,m"(value) : "memory"); } @@ -323,25 +473,125 @@ inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) { #endif } -// Force the compiler to flush pending writes to global memory. 
Acts as an -// effective read/write barrier +#ifdef BENCHMARK_HAS_CXX11 +template +inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) { +#if defined(__clang__) + asm volatile("" : "+r,m"(value) : : "memory"); +#else + asm volatile("" : "+m,r"(value) : : "memory"); +#endif +} +#endif +#elif defined(BENCHMARK_HAS_CXX11) && (__GNUC__ >= 5) +// Workaround for a bug with full argument copy overhead with GCC. +// See: #1340 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105519 +template +BENCHMARK_DEPRECATED_MSG( + "The const-ref version of this method can permit " + "undesired compiler optimizations in benchmarks") +inline BENCHMARK_ALWAYS_INLINE + typename std::enable_if::value && + (sizeof(Tp) <= sizeof(Tp*))>::type + DoNotOptimize(Tp const& value) { + asm volatile("" : : "r,m"(value) : "memory"); +} + +template +BENCHMARK_DEPRECATED_MSG( + "The const-ref version of this method can permit " + "undesired compiler optimizations in benchmarks") +inline BENCHMARK_ALWAYS_INLINE + typename std::enable_if::value || + (sizeof(Tp) > sizeof(Tp*))>::type + DoNotOptimize(Tp const& value) { + asm volatile("" : : "m"(value) : "memory"); +} + +template +inline BENCHMARK_ALWAYS_INLINE + typename std::enable_if::value && + (sizeof(Tp) <= sizeof(Tp*))>::type + DoNotOptimize(Tp& value) { + asm volatile("" : "+m,r"(value) : : "memory"); +} + +template +inline BENCHMARK_ALWAYS_INLINE + typename std::enable_if::value || + (sizeof(Tp) > sizeof(Tp*))>::type + DoNotOptimize(Tp& value) { + asm volatile("" : "+m"(value) : : "memory"); +} + +template +inline BENCHMARK_ALWAYS_INLINE + typename std::enable_if::value && + (sizeof(Tp) <= sizeof(Tp*))>::type + DoNotOptimize(Tp&& value) { + asm volatile("" : "+m,r"(value) : : "memory"); +} + +template +inline BENCHMARK_ALWAYS_INLINE + typename std::enable_if::value || + (sizeof(Tp) > sizeof(Tp*))>::type + DoNotOptimize(Tp&& value) { + asm volatile("" : "+m"(value) : : "memory"); +} + +#else +// Fallback for GCC < 5. 
Can add some overhead because the compiler is forced +// to use memory operations instead of operations with registers. +// TODO: Remove if GCC < 5 will be unsupported. +template +BENCHMARK_DEPRECATED_MSG( + "The const-ref version of this method can permit " + "undesired compiler optimizations in benchmarks") +inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { + asm volatile("" : : "m"(value) : "memory"); +} + +template +inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) { + asm volatile("" : "+m"(value) : : "memory"); +} + +#ifdef BENCHMARK_HAS_CXX11 +template +inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp&& value) { + asm volatile("" : "+m"(value) : : "memory"); +} +#endif +#endif + +#ifndef BENCHMARK_HAS_CXX11 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { asm volatile("" : : : "memory"); } +#endif #elif defined(_MSC_VER) template +BENCHMARK_DEPRECATED_MSG( + "The const-ref version of this method can permit " + "undesired compiler optimizations in benchmarks") inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { internal::UseCharPointer(&reinterpret_cast(value)); _ReadWriteBarrier(); } +#ifndef BENCHMARK_HAS_CXX11 inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); } +#endif #else template +BENCHMARK_DEPRECATED_MSG( + "The const-ref version of this method can permit " + "undesired compiler optimizations in benchmarks") inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) { internal::UseCharPointer(&reinterpret_cast(value)); } -// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers +// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers, before C++11. #endif // This class is used for user-defined counters. @@ -351,24 +601,27 @@ class Counter { kDefaults = 0, // Mark the counter as a rate. It will be presented divided // by the duration of the benchmark. - kIsRate = 1U << 0U, + kIsRate = 1 << 0, // Mark the counter as a thread-average quantity. 
It will be // presented divided by the number of threads. - kAvgThreads = 1U << 1U, + kAvgThreads = 1 << 1, // Mark the counter as a thread-average rate. See above. kAvgThreadsRate = kIsRate | kAvgThreads, // Mark the counter as a constant value, valid/same for *every* iteration. // When reporting, it will be *multiplied* by the iteration count. - kIsIterationInvariant = 1U << 2U, + kIsIterationInvariant = 1 << 2, // Mark the counter as a constant rate. // When reporting, it will be *multiplied* by the iteration count // and then divided by the duration of the benchmark. kIsIterationInvariantRate = kIsRate | kIsIterationInvariant, // Mark the counter as a iteration-average quantity. // It will be presented divided by the number of iterations. - kAvgIterations = 1U << 3U, + kAvgIterations = 1 << 3, // Mark the counter as a iteration-average rate. See above. - kAvgIterationsRate = kIsRate | kAvgIterations + kAvgIterationsRate = kIsRate | kAvgIterations, + + // In the end, invert the result. This is always done last! + kInvert = 1 << 31 }; enum OneK { @@ -386,7 +639,7 @@ class Counter { Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000) : value(v), flags(f), oneK(k) {} - BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; } + BENCHMARK_ALWAYS_INLINE operator double const &() const { return value; } BENCHMARK_ALWAYS_INLINE operator double&() { return value; } }; @@ -401,17 +654,15 @@ Counter::Flags inline operator|(const Counter::Flags& LHS, // This is the container for the user-defined counters. typedef std::map UserCounters; -// TimeUnit is passed to a benchmark in order to specify the order of magnitude -// for the measured time. -enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond }; - // BigO is passed to a benchmark in order to specify the asymptotic // computational // complexity for the benchmark. In case oAuto is selected, complexity will be // calculated automatically to the best fit. 
enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda }; -typedef uint64_t IterationCount; +typedef int64_t IterationCount; + +enum StatisticUnit { kTime, kPercentage }; // BigOFunc is passed to a benchmark in order to specify the asymptotic // computational complexity for the benchmark. @@ -425,14 +676,17 @@ namespace internal { struct Statistics { std::string name_; StatisticsFunc* compute_; + StatisticUnit unit_; - Statistics(const std::string& name, StatisticsFunc* compute) - : name_(name), compute_(compute) {} + Statistics(const std::string& name, StatisticsFunc* compute, + StatisticUnit unit = kTime) + : name_(name), compute_(compute), unit_(unit) {} }; -struct BenchmarkInstance; +class BenchmarkInstance; class ThreadTimer; class ThreadManager; +class PerfCountersMeasurement; enum AggregationReportMode #if defined(BENCHMARK_HAS_CXX11) @@ -454,11 +708,21 @@ enum AggregationReportMode ARM_FileReportAggregatesOnly | ARM_DisplayReportAggregatesOnly }; +enum Skipped +#if defined(BENCHMARK_HAS_CXX11) + : unsigned +#endif +{ + NotSkipped = 0, + SkippedWithMessage, + SkippedWithError +}; + } // namespace internal // State is passed to a running Benchmark and contains state for the // benchmark to use. -class State { +class BENCHMARK_EXPORT State { public: struct StateIterator; friend struct StateIterator; @@ -490,8 +754,8 @@ class State { // } bool KeepRunningBatch(IterationCount n); - // REQUIRES: timer is running and 'SkipWithError(...)' has not been called - // by the current thread. + // REQUIRES: timer is running and 'SkipWithMessage(...)' or + // 'SkipWithError(...)' has not been called by the current thread. // Stop the benchmark timer. If not called, the timer will be // automatically stopped after the last iteration of the benchmark loop. // @@ -506,8 +770,8 @@ class State { // within each benchmark iteration, if possible. 
void PauseTiming(); - // REQUIRES: timer is not running and 'SkipWithError(...)' has not been called - // by the current thread. + // REQUIRES: timer is not running and 'SkipWithMessage(...)' or + // 'SkipWithError(...)' has not been called by the current thread. // Start the benchmark timer. The timer is NOT running on entrance to the // benchmark function. It begins running after control flow enters the // benchmark loop. @@ -517,8 +781,30 @@ class State { // within each benchmark iteration, if possible. void ResumeTiming(); - // REQUIRES: 'SkipWithError(...)' has not been called previously by the - // current thread. + // REQUIRES: 'SkipWithMessage(...)' or 'SkipWithError(...)' has not been + // called previously by the current thread. + // Report the benchmark as resulting in being skipped with the specified + // 'msg'. + // After this call the user may explicitly 'return' from the benchmark. + // + // If the ranged-for style of benchmark loop is used, the user must explicitly + // break from the loop, otherwise all future iterations will be run. + // If the 'KeepRunning()' loop is used the current thread will automatically + // exit the loop at the end of the current iteration. + // + // For threaded benchmarks only the current thread stops executing and future + // calls to `KeepRunning()` will block until all threads have completed + // the `KeepRunning()` loop. If multiple threads report being skipped only the + // first skip message is used. + // + // NOTE: Calling 'SkipWithMessage(...)' does not cause the benchmark to exit + // the current scope immediately. If the function is called from within + // the 'KeepRunning()' loop the current iteration will finish. It is the users + // responsibility to exit the scope as needed. + void SkipWithMessage(const std::string& msg); + + // REQUIRES: 'SkipWithMessage(...)' or 'SkipWithError(...)' has not been + // called previously by the current thread. 
// Report the benchmark as resulting in an error with the specified 'msg'. // After this call the user may explicitly 'return' from the benchmark. // @@ -536,7 +822,13 @@ class State { // the current scope immediately. If the function is called from within // the 'KeepRunning()' loop the current iteration will finish. It is the users // responsibility to exit the scope as needed. - void SkipWithError(const char* msg); + void SkipWithError(const std::string& msg); + + // Returns true if 'SkipWithMessage(...)' or 'SkipWithError(...)' was called. + bool skipped() const { return internal::NotSkipped != skipped_; } + + // Returns true if an error has been reported with 'SkipWithError(...)'. + bool error_occurred() const { return internal::SkippedWithError == skipped_; } // REQUIRES: called exactly once per iteration of the benchmarking loop. // Set the manually measured time for this benchmark iteration, which @@ -574,7 +866,7 @@ class State { void SetComplexityN(int64_t complexity_n) { complexity_n_ = complexity_n; } BENCHMARK_ALWAYS_INLINE - int64_t complexity_length_n() { return complexity_n_; } + int64_t complexity_length_n() const { return complexity_n_; } // If this routine is called with items > 0, then an items/s // label is printed on the benchmark report line for the currently @@ -607,11 +899,7 @@ class State { // BM_Compress 50 50 14115038 compress:27.3% // // REQUIRES: a benchmark has exited its benchmarking loop. - void SetLabel(const char* label); - - void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) { - this->SetLabel(str.c_str()); - } + void SetLabel(const std::string& label); // Range arguments for this run. CHECKs if the argument has been set. BENCHMARK_ALWAYS_INLINE @@ -626,6 +914,14 @@ class State { BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead") int64_t range_y() const { return range(1); } + // Number of threads concurrently executing the benchmark. 
+ BENCHMARK_ALWAYS_INLINE + int threads() const { return threads_; } + + // Index of the executing thread. Values from [0, threads). + BENCHMARK_ALWAYS_INLINE + int thread_index() const { return thread_index_; } + BENCHMARK_ALWAYS_INLINE IterationCount iterations() const { if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) { @@ -634,8 +930,11 @@ class State { return max_iterations - total_iterations_ + batch_leftover_; } - private - : // items we expect on the first cache line (ie 64 bytes of the struct) + BENCHMARK_ALWAYS_INLINE + std::string name() const { return name_; } + + private: + // items we expect on the first cache line (ie 64 bytes of the struct) // When total_iterations_ is 0, KeepRunning() and friends will return false. // May be larger than max_iterations. IterationCount total_iterations_; @@ -651,9 +950,9 @@ class State { private: bool started_; bool finished_; - bool error_occurred_; + internal::Skipped skipped_; - private: // items we don't need on the first cache line + // items we don't need on the first cache line std::vector range_; int64_t complexity_n_; @@ -661,25 +960,28 @@ class State { public: // Container for user-defined counters. UserCounters counters; - // Index of the executing thread. Values from [0, threads). - const int thread_index; - // Number of threads concurrently executing the benchmark. - const int threads; private: - State(IterationCount max_iters, const std::vector& ranges, - int thread_i, int n_threads, internal::ThreadTimer* timer, - internal::ThreadManager* manager); + State(std::string name, IterationCount max_iters, + const std::vector& ranges, int thread_i, int n_threads, + internal::ThreadTimer* timer, internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement); void StartKeepRunning(); // Implementation of KeepRunning() and KeepRunningBatch(). // is_batch must be true unless n is 1. 
bool KeepRunningInternal(IterationCount n, bool is_batch); void FinishKeepRunning(); - internal::ThreadTimer* timer_; - internal::ThreadManager* manager_; - friend struct internal::BenchmarkInstance; + const std::string name_; + const int thread_index_; + const int threads_; + + internal::ThreadTimer* const timer_; + internal::ThreadManager* const manager_; + internal::PerfCountersMeasurement* const perf_counters_measurement_; + + friend class internal::BenchmarkInstance; }; inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() { @@ -703,7 +1005,7 @@ inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(IterationCount n, } if (!started_) { StartKeepRunning(); - if (!error_occurred_ && total_iterations_ >= n) { + if (!skipped() && total_iterations_ >= n) { total_iterations_ -= n; return true; } @@ -733,7 +1035,7 @@ struct State::StateIterator { BENCHMARK_ALWAYS_INLINE explicit StateIterator(State* st) - : cached_(st->error_occurred_ ? 0 : st->max_iterations), parent_(st) {} + : cached_(st->skipped() ? 0 : st->max_iterations), parent_(st) {} public: BENCHMARK_ALWAYS_INLINE @@ -776,13 +1078,16 @@ typedef void(Function)(State&); // be called on this object to change the properties of the benchmark. // Each method returns "this" so that multiple method calls can // chained into one expression. -class Benchmark { +class BENCHMARK_EXPORT Benchmark { public: virtual ~Benchmark(); // Note: the following methods all return "this" so that multiple // method calls can be chained together in one expression. + // Specify the name of the benchmark + Benchmark* Name(const std::string& name); + // Run this benchmark once with "x" as the extra argument passed // to the function. // REQUIRES: The function passed to the constructor must accept an arg1. @@ -821,6 +1126,11 @@ class Benchmark { // REQUIRES: The function passed to the constructor must accept arg1, arg2 ... 
Benchmark* Ranges(const std::vector >& ranges); + // Run this benchmark once for each combination of values in the (cartesian) + // product of the supplied argument lists. + // REQUIRES: The function passed to the constructor must accept arg1, arg2 ... + Benchmark* ArgsProduct(const std::vector >& arglists); + // Equivalent to ArgNames({name}) Benchmark* ArgName(const std::string& name); @@ -838,6 +1148,23 @@ class Benchmark { return Ranges(ranges); } + // Have "setup" and/or "teardown" invoked once for every benchmark run. + // If the benchmark is multi-threaded (will run in k threads concurrently), + // the setup callback will be invoked exactly once (not k times) before + // each run with k threads. Time allowing (e.g. for a short benchmark), there + // may be multiple such runs per benchmark, each run with its own + // "setup"/"teardown". + // + // If the benchmark uses different size groups of threads (e.g. via + // ThreadRange), the above will be true for each size group. + // + // The callback will be passed a State object, which includes the number + // of threads, thread-index, benchmark arguments, etc. + // + // The callback must not be NULL or self-deleting. + Benchmark* Setup(void (*setup)(const benchmark::State&)); + Benchmark* Teardown(void (*teardown)(const benchmark::State&)); + // Pass this benchmark object to *func, which can customize // the benchmark by calling various methods like Arg, Args, // Threads, etc. @@ -852,12 +1179,19 @@ class Benchmark { // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark. Benchmark* MinTime(double t); + // Set the minimum amount of time to run the benchmark before taking runtimes + // of this benchmark into account. This + // option overrides the `benchmark_min_warmup_time` flag. + // REQUIRES: `t >= 0` and `Iterations` has not been called on this benchmark. + Benchmark* MinWarmUpTime(double t); + // Specify the amount of iterations that should be run by this benchmark.
+ // This option overrides the `benchmark_min_time` flag. // REQUIRES: 'n > 0' and `MinTime` has not been called on this benchmark. // // NOTE: This function should only be used when *exact* iteration control is // needed and never to control or limit how long a benchmark runs, where - // `--benchmark_min_time=N` or `MinTime(...)` should be used instead. + // `--benchmark_min_time=s` or `MinTime(...)` should be used instead. Benchmark* Iterations(IterationCount n); // Specify the amount of times to repeat this benchmark. This option overrides @@ -877,7 +1211,7 @@ class Benchmark { // By default, the CPU time is measured only for the main thread, which may // be unrepresentative if the benchmark uses threads internally. If called, // the total CPU time spent by all the threads will be measured instead. - // By default, the only the main thread CPU time will be measured. + // By default, only the main thread CPU time will be measured. Benchmark* MeasureProcessCPUTime(); // If a particular benchmark should use the Wall clock instead of the CPU time @@ -906,7 +1240,9 @@ class Benchmark { Benchmark* Complexity(BigOFunc* complexity); // Add this statistics to be computed over all the values of benchmark run - Benchmark* ComputeStatistics(std::string name, StatisticsFunc* statistics); + Benchmark* ComputeStatistics(const std::string& name, + StatisticsFunc* statistics, + StatisticUnit unit = kTime); // Support for running multiple copies of the same benchmark concurrently // in multiple threads. 
This may be useful when measuring the scaling @@ -940,23 +1276,32 @@ class Benchmark { virtual void Run(State& state) = 0; + TimeUnit GetTimeUnit() const; + protected: - explicit Benchmark(const char* name); - Benchmark(Benchmark const&); - void SetName(const char* name); + explicit Benchmark(const std::string& name); + void SetName(const std::string& name); + public: + const char* GetName() const; int ArgsCnt() const; + const char* GetArgName(int arg) const; private: friend class BenchmarkFamilies; + friend class BenchmarkInstance; std::string name_; AggregationReportMode aggregation_report_mode_; std::vector arg_names_; // Args for all benchmark runs std::vector > args_; // Args for all benchmark runs + TimeUnit time_unit_; + bool use_default_time_unit_; + int range_multiplier_; double min_time_; + double min_warmup_time_; IterationCount iterations_; int repetitions_; bool measure_process_cpu_time_; @@ -967,7 +1312,21 @@ class Benchmark { std::vector statistics_; std::vector thread_counts_; - Benchmark& operator=(Benchmark const&); + typedef void (*callback_function)(const benchmark::State&); + callback_function setup_; + callback_function teardown_; + + Benchmark(Benchmark const&) +#if defined(BENCHMARK_HAS_CXX11) + = delete +#endif + ; + + Benchmark& operator=(Benchmark const&) +#if defined(BENCHMARK_HAS_CXX11) + = delete +#endif + ; }; } // namespace internal @@ -976,27 +1335,27 @@ class Benchmark { // the specified functor 'fn'. // // RETURNS: A pointer to the registered benchmark. -internal::Benchmark* RegisterBenchmark(const char* name, +internal::Benchmark* RegisterBenchmark(const std::string& name, internal::Function* fn); #if defined(BENCHMARK_HAS_CXX11) template -internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn); +internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn); #endif // Remove all registered benchmarks. All pointers to previously registered // benchmarks are invalidated. 
-void ClearRegisteredBenchmarks(); +BENCHMARK_EXPORT void ClearRegisteredBenchmarks(); namespace internal { // The class used to hold all Benchmarks created from static function. // (ie those created using the BENCHMARK(...) macros. -class FunctionBenchmark : public Benchmark { +class BENCHMARK_EXPORT FunctionBenchmark : public Benchmark { public: - FunctionBenchmark(const char* name, Function* func) + FunctionBenchmark(const std::string& name, Function* func) : Benchmark(name), func_(func) {} - virtual void Run(State& st); + void Run(State& st) BENCHMARK_OVERRIDE; private: Function* func_; @@ -1006,36 +1365,38 @@ class FunctionBenchmark : public Benchmark { template class LambdaBenchmark : public Benchmark { public: - virtual void Run(State& st) { lambda_(st); } + void Run(State& st) BENCHMARK_OVERRIDE { lambda_(st); } private: template - LambdaBenchmark(const char* name, OLambda&& lam) + LambdaBenchmark(const std::string& name, OLambda&& lam) : Benchmark(name), lambda_(std::forward(lam)) {} LambdaBenchmark(LambdaBenchmark const&) = delete; - private: - template - friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&); + template // NOLINTNEXTLINE(readability-redundant-declaration) + friend Benchmark* ::benchmark::RegisterBenchmark(const std::string&, Lam&&); Lambda lambda_; }; #endif - } // namespace internal -inline internal::Benchmark* RegisterBenchmark(const char* name, +inline internal::Benchmark* RegisterBenchmark(const std::string& name, internal::Function* fn) { + // FIXME: this should be a `std::make_unique<>()` but we don't have C++14. 
+ // codechecker_intentional [cplusplus.NewDeleteLeaks] return internal::RegisterBenchmarkInternal( ::new internal::FunctionBenchmark(name, fn)); } #ifdef BENCHMARK_HAS_CXX11 template -internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) { +internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn) { using BenchType = internal::LambdaBenchmark::type>; + // FIXME: this should be a `std::make_unique<>()` but we don't have C++14. + // codechecker_intentional [cplusplus.NewDeleteLeaks] return internal::RegisterBenchmarkInternal( ::new BenchType(name, std::forward(fn))); } @@ -1044,7 +1405,7 @@ internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) { #if defined(BENCHMARK_HAS_CXX11) && \ (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409) template -internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn, +internal::Benchmark* RegisterBenchmark(const std::string& name, Lambda&& fn, Args&&... args) { return benchmark::RegisterBenchmark( name, [=](benchmark::State& st) { fn(st, args...); }); @@ -1058,7 +1419,7 @@ class Fixture : public internal::Benchmark { public: Fixture() : internal::Benchmark("") {} - virtual void Run(State& st) { + void Run(State& st) BENCHMARK_OVERRIDE { this->SetUp(st); this->BenchmarkCase(st); this->TearDown(st); @@ -1074,7 +1435,6 @@ class Fixture : public internal::Benchmark { protected: virtual void BenchmarkCase(State&) = 0; }; - } // namespace benchmark // ------------------------------------------------------ @@ -1090,19 +1450,37 @@ class Fixture : public internal::Benchmark { #endif // Helpers for generating unique variable names +#ifdef BENCHMARK_HAS_CXX11 +#define BENCHMARK_PRIVATE_NAME(...) 
\ + BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, \ + __VA_ARGS__) +#else #define BENCHMARK_PRIVATE_NAME(n) \ - BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n) + BENCHMARK_PRIVATE_CONCAT(benchmark_uniq_, BENCHMARK_PRIVATE_UNIQUE_ID, n) +#endif // BENCHMARK_HAS_CXX11 + #define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c) #define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c +// Helper for concatenation with macro name expansion +#define BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method) \ + BaseClass##_##Method##_Benchmark #define BENCHMARK_PRIVATE_DECLARE(n) \ static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \ BENCHMARK_UNUSED +#ifdef BENCHMARK_HAS_CXX11 +#define BENCHMARK(...) \ + BENCHMARK_PRIVATE_DECLARE(_benchmark_) = \ + (::benchmark::internal::RegisterBenchmarkInternal( \ + new ::benchmark::internal::FunctionBenchmark(#__VA_ARGS__, \ + __VA_ARGS__))) +#else #define BENCHMARK(n) \ BENCHMARK_PRIVATE_DECLARE(n) = \ (::benchmark::internal::RegisterBenchmarkInternal( \ new ::benchmark::internal::FunctionBenchmark(#n, n))) +#endif // BENCHMARK_HAS_CXX11 // Old-style macros #define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a)) @@ -1163,49 +1541,49 @@ class Fixture : public internal::Benchmark { #define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a) #endif -#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \ - class BaseClass##_##Method##_Benchmark : public BaseClass { \ - public: \ - BaseClass##_##Method##_Benchmark() : BaseClass() { \ - this->SetName(#BaseClass "/" #Method); \ - } \ - \ - protected: \ - virtual void BenchmarkCase(::benchmark::State&); \ +#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \ + class BaseClass##_##Method##_Benchmark : public BaseClass { \ + public: \ + BaseClass##_##Method##_Benchmark() { \ + this->SetName(#BaseClass "/" #Method); \ + } \ + \ + protected: \ + void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \ }; #define 
BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \ class BaseClass##_##Method##_Benchmark : public BaseClass { \ public: \ - BaseClass##_##Method##_Benchmark() : BaseClass() { \ + BaseClass##_##Method##_Benchmark() { \ this->SetName(#BaseClass "<" #a ">/" #Method); \ } \ \ protected: \ - virtual void BenchmarkCase(::benchmark::State&); \ + void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \ }; #define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \ class BaseClass##_##Method##_Benchmark : public BaseClass { \ public: \ - BaseClass##_##Method##_Benchmark() : BaseClass() { \ + BaseClass##_##Method##_Benchmark() { \ this->SetName(#BaseClass "<" #a "," #b ">/" #Method); \ } \ \ protected: \ - virtual void BenchmarkCase(::benchmark::State&); \ + void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \ }; #ifdef BENCHMARK_HAS_CXX11 #define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, ...) \ class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \ public: \ - BaseClass##_##Method##_Benchmark() : BaseClass<__VA_ARGS__>() { \ + BaseClass##_##Method##_Benchmark() { \ this->SetName(#BaseClass "<" #__VA_ARGS__ ">/" #Method); \ } \ \ protected: \ - virtual void BenchmarkCase(::benchmark::State&); \ + void BenchmarkCase(::benchmark::State&) BENCHMARK_OVERRIDE; \ }; #else #define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \ @@ -1214,27 +1592,27 @@ class Fixture : public internal::Benchmark { #define BENCHMARK_DEFINE_F(BaseClass, Method) \ BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase + void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase #define BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a) \ BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase + void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase #define BENCHMARK_TEMPLATE2_DEFINE_F(BaseClass, Method, 
a, b) \ BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase + void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase #ifdef BENCHMARK_HAS_CXX11 #define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, ...) \ BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase + void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase #else #define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) \ BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a) #endif #define BENCHMARK_REGISTER_F(BaseClass, Method) \ - BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark) + BENCHMARK_PRIVATE_REGISTER_F(BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)) #define BENCHMARK_PRIVATE_REGISTER_F(TestName) \ BENCHMARK_PRIVATE_DECLARE(TestName) = \ @@ -1244,34 +1622,43 @@ class Fixture : public internal::Benchmark { #define BENCHMARK_F(BaseClass, Method) \ BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \ BENCHMARK_REGISTER_F(BaseClass, Method); \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase + void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase #define BENCHMARK_TEMPLATE1_F(BaseClass, Method, a) \ BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \ BENCHMARK_REGISTER_F(BaseClass, Method); \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase + void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase #define BENCHMARK_TEMPLATE2_F(BaseClass, Method, a, b) \ BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \ BENCHMARK_REGISTER_F(BaseClass, Method); \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase + void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase #ifdef BENCHMARK_HAS_CXX11 #define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...) 
\ BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \ BENCHMARK_REGISTER_F(BaseClass, Method); \ - void BaseClass##_##Method##_Benchmark::BenchmarkCase + void BENCHMARK_PRIVATE_CONCAT_NAME(BaseClass, Method)::BenchmarkCase #else #define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) \ BENCHMARK_TEMPLATE1_F(BaseClass, Method, a) #endif // Helper macro to create a main routine in a test that runs the benchmarks +// Note the workaround for Hexagon simulator passing argc != 0, argv = NULL. #define BENCHMARK_MAIN() \ int main(int argc, char** argv) { \ + char arg0_default[] = "benchmark"; \ + char* args_default = arg0_default; \ + if (!argv) { \ + argc = 1; \ + argv = &args_default; \ + } \ ::benchmark::Initialize(&argc, argv); \ if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; \ ::benchmark::RunSpecifiedBenchmarks(); \ + ::benchmark::Shutdown(); \ + return 0; \ } \ int main(int, char**) @@ -1280,7 +1667,7 @@ class Fixture : public internal::Benchmark { namespace benchmark { -struct CPUInfo { +struct BENCHMARK_EXPORT CPUInfo { struct CacheInfo { std::string type; int level; @@ -1288,10 +1675,12 @@ struct CPUInfo { int num_sharing; }; + enum Scaling { UNKNOWN, ENABLED, DISABLED }; + int num_cpus; + Scaling scaling; double cycles_per_second; std::vector caches; - bool scaling_enabled; std::vector load_avg; static const CPUInfo& Get(); @@ -1302,7 +1691,7 @@ struct CPUInfo { }; // Adding Struct for System Information -struct SystemInfo { +struct BENCHMARK_EXPORT SystemInfo { std::string name; static const SystemInfo& Get(); @@ -1314,10 +1703,11 @@ struct SystemInfo { // BenchmarkName contains the components of the Benchmark's name // which allows individual fields to be modified or cleared before // building the final name using 'str()'. 
-struct BenchmarkName { +struct BENCHMARK_EXPORT BenchmarkName { std::string function_name; std::string args; std::string min_time; + std::string min_warmup_time; std::string iterations; std::string repetitions; std::string time_type; @@ -1333,7 +1723,7 @@ struct BenchmarkName { // can control the destination of the reports by calling // RunSpecifiedBenchmarks and passing it a custom reporter object. // The reporter object must implement the following interface. -class BenchmarkReporter { +class BENCHMARK_EXPORT BenchmarkReporter { public: struct Context { CPUInfo const& cpu_info; @@ -1344,16 +1734,17 @@ class BenchmarkReporter { Context(); }; - struct Run { + struct BENCHMARK_EXPORT Run { static const int64_t no_repetition_index = -1; enum RunType { RT_Iteration, RT_Aggregate }; Run() : run_type(RT_Iteration), - error_occurred(false), + aggregate_unit(kTime), + skipped(internal::NotSkipped), iterations(1), threads(1), - time_unit(kNanosecond), + time_unit(GetDefaultTimeUnit()), real_accumulated_time(0), cpu_accumulated_time(0), max_heapbytes_used(0), @@ -1362,18 +1753,19 @@ class BenchmarkReporter { complexity_n(0), report_big_o(false), report_rms(false), - counters(), - has_memory_result(false), - allocs_per_iter(0.0), - max_bytes_used(0) {} + memory_result(NULL), + allocs_per_iter(0.0) {} std::string benchmark_name() const; BenchmarkName run_name; + int64_t family_index; + int64_t per_family_instance_index; RunType run_type; std::string aggregate_name; + StatisticUnit aggregate_unit; std::string report_label; // Empty if not set by benchmark. - bool error_occurred; - std::string error_message; + internal::Skipped skipped; + std::string skip_message; IterationCount iterations; int64_t threads; @@ -1413,9 +1805,21 @@ class BenchmarkReporter { UserCounters counters; // Memory metrics. 
- bool has_memory_result; + const MemoryManager::Result* memory_result; double allocs_per_iter; - int64_t max_bytes_used; + }; + + struct PerFamilyRunReports { + PerFamilyRunReports() : num_runs_total(0), num_runs_done(0) {} + + // How many runs will all instances of this benchmark perform? + int num_runs_total; + + // How many runs have happened already? + int num_runs_done; + + // The reports about (non-erroneous!) runs of this family. + std::vector Runs; }; // Construct a BenchmarkReporter with the output stream set to 'std::cout' @@ -1430,6 +1834,12 @@ class BenchmarkReporter { // to skip runs based on the context information. virtual bool ReportContext(const Context& context) = 0; + // Called once for each group of benchmark runs, gives information about + // the configurations of the runs. + virtual void ReportRunsConfig(double /*min_time*/, + bool /*has_explicit_iters*/, + IterationCount /*iters*/) {} + // Called once for each group of benchmark runs, gives information about + // cpu-time and heap memory usage during the benchmark run. If the group + // of runs contained more than two entries then 'report' contains additional
-class ConsoleReporter : public BenchmarkReporter { +class BENCHMARK_EXPORT ConsoleReporter : public BenchmarkReporter { public: enum OutputOptions { OO_None = 0, @@ -1485,13 +1895,10 @@ class ConsoleReporter : public BenchmarkReporter { OO_Defaults = OO_ColorTabular }; explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults) - : output_options_(opts_), - name_field_width_(0), - prev_counters_(), - printed_header_(false) {} + : output_options_(opts_), name_field_width_(0), printed_header_(false) {} - virtual bool ReportContext(const Context& context); - virtual void ReportRuns(const std::vector& reports); + bool ReportContext(const Context& context) BENCHMARK_OVERRIDE; + void ReportRuns(const std::vector& reports) BENCHMARK_OVERRIDE; protected: virtual void PrintRunData(const Run& report); @@ -1503,12 +1910,12 @@ class ConsoleReporter : public BenchmarkReporter { bool printed_header_; }; -class JSONReporter : public BenchmarkReporter { +class BENCHMARK_EXPORT JSONReporter : public BenchmarkReporter { public: JSONReporter() : first_report_(true) {} - virtual bool ReportContext(const Context& context); - virtual void ReportRuns(const std::vector& reports); - virtual void Finalize(); + bool ReportContext(const Context& context) BENCHMARK_OVERRIDE; + void ReportRuns(const std::vector& reports) BENCHMARK_OVERRIDE; + void Finalize() BENCHMARK_OVERRIDE; private: void PrintRunData(const Run& report); @@ -1516,13 +1923,13 @@ class JSONReporter : public BenchmarkReporter { bool first_report_; }; -class BENCHMARK_DEPRECATED_MSG( +class BENCHMARK_EXPORT BENCHMARK_DEPRECATED_MSG( "The CSV Reporter will be removed in a future release") CSVReporter : public BenchmarkReporter { public: CSVReporter() : printed_header_(false) {} - virtual bool ReportContext(const Context& context); - virtual void ReportRuns(const std::vector& reports); + bool ReportContext(const Context& context) BENCHMARK_OVERRIDE; + void ReportRuns(const std::vector& reports) BENCHMARK_OVERRIDE; private: void 
PrintRunData(const Run& report); @@ -1531,31 +1938,10 @@ class BENCHMARK_DEPRECATED_MSG( std::set user_counter_names_; }; -// If a MemoryManager is registered, it can be used to collect and report -// allocation metrics for a run of the benchmark. -class MemoryManager { - public: - struct Result { - Result() : num_allocs(0), max_bytes_used(0) {} - - // The number of allocations made in total between Start and Stop. - int64_t num_allocs; - - // The peak memory use between Start and Stop. - int64_t max_bytes_used; - }; - - virtual ~MemoryManager() {} - - // Implement this to start recording allocation information. - virtual void Start() = 0; - - // Implement this to stop recording and fill out the given Result structure. - virtual void Stop(Result* result) = 0; -}; - inline const char* GetTimeUnitString(TimeUnit unit) { switch (unit) { + case kSecond: + return "s"; case kMillisecond: return "ms"; case kMicrosecond: @@ -1568,6 +1954,8 @@ inline const char* GetTimeUnitString(TimeUnit unit) { inline double GetTimeUnitMultiplier(TimeUnit unit) { switch (unit) { + case kSecond: + return 1; case kMillisecond: return 1e3; case kMicrosecond: @@ -1578,6 +1966,26 @@ inline double GetTimeUnitMultiplier(TimeUnit unit) { BENCHMARK_UNREACHABLE(); } +// Creates a list of integer values for the given range and multiplier. +// This can be used together with ArgsProduct() to allow multiple ranges +// with different multipliers. +// Example: +// ArgsProduct({ +// CreateRange(0, 1024, /*multi=*/32), +// CreateRange(0, 100, /*multi=*/4), +// CreateDenseRange(0, 4, /*step=*/1), +// }); +BENCHMARK_EXPORT +std::vector CreateRange(int64_t lo, int64_t hi, int multi); + +// Creates a list of integer values for the given range and step. 
+BENCHMARK_EXPORT +std::vector CreateDenseRange(int64_t start, int64_t limit, int step); + } // namespace benchmark +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + #endif // BENCHMARK_BENCHMARK_H_ diff --git a/ThirdParty/googlebenchmark/include/benchmark/export.h b/ThirdParty/googlebenchmark/include/benchmark/export.h new file mode 100644 index 0000000000..f96f8596cd --- /dev/null +++ b/ThirdParty/googlebenchmark/include/benchmark/export.h @@ -0,0 +1,47 @@ +#ifndef BENCHMARK_EXPORT_H +#define BENCHMARK_EXPORT_H + +#if defined(_WIN32) +#define EXPORT_ATTR __declspec(dllexport) +#define IMPORT_ATTR __declspec(dllimport) +#define NO_EXPORT_ATTR +#define DEPRECATE_ATTR __declspec(deprecated) +#else // _WIN32 +#define EXPORT_ATTR __attribute__((visibility("default"))) +#define IMPORT_ATTR __attribute__((visibility("default"))) +#define NO_EXPORT_ATTR __attribute__((visibility("hidden"))) +#define DEPRECATE_ATTR __attribute__((__deprecated__)) +#endif // _WIN32 + +#ifdef BENCHMARK_STATIC_DEFINE +#define BENCHMARK_EXPORT +#define BENCHMARK_NO_EXPORT +#else // BENCHMARK_STATIC_DEFINE +#ifndef BENCHMARK_EXPORT +#ifdef benchmark_EXPORTS +/* We are building this library */ +#define BENCHMARK_EXPORT EXPORT_ATTR +#else // benchmark_EXPORTS +/* We are using this library */ +#define BENCHMARK_EXPORT IMPORT_ATTR +#endif // benchmark_EXPORTS +#endif // !BENCHMARK_EXPORT + +#ifndef BENCHMARK_NO_EXPORT +#define BENCHMARK_NO_EXPORT NO_EXPORT_ATTR +#endif // !BENCHMARK_NO_EXPORT +#endif // BENCHMARK_STATIC_DEFINE + +#ifndef BENCHMARK_DEPRECATED +#define BENCHMARK_DEPRECATED DEPRECATE_ATTR +#endif // BENCHMARK_DEPRECATED + +#ifndef BENCHMARK_DEPRECATED_EXPORT +#define BENCHMARK_DEPRECATED_EXPORT BENCHMARK_EXPORT BENCHMARK_DEPRECATED +#endif // BENCHMARK_DEPRECATED_EXPORT + +#ifndef BENCHMARK_DEPRECATED_NO_EXPORT +#define BENCHMARK_DEPRECATED_NO_EXPORT BENCHMARK_NO_EXPORT BENCHMARK_DEPRECATED +#endif // BENCHMARK_DEPRECATED_NO_EXPORT + +#endif /* BENCHMARK_EXPORT_H */ diff --git
a/ThirdParty/googlebenchmark/mingw.py b/ThirdParty/googlebenchmark/mingw.py deleted file mode 100644 index 706ad559db..0000000000 --- a/ThirdParty/googlebenchmark/mingw.py +++ /dev/null @@ -1,320 +0,0 @@ -#! /usr/bin/env python -# encoding: utf-8 - -import argparse -import errno -import logging -import os -import platform -import re -import sys -import subprocess -import tempfile - -try: - import winreg -except ImportError: - import _winreg as winreg -try: - import urllib.request as request -except ImportError: - import urllib as request -try: - import urllib.parse as parse -except ImportError: - import urlparse as parse - -class EmptyLogger(object): - ''' - Provides an implementation that performs no logging - ''' - def debug(self, *k, **kw): - pass - def info(self, *k, **kw): - pass - def warn(self, *k, **kw): - pass - def error(self, *k, **kw): - pass - def critical(self, *k, **kw): - pass - def setLevel(self, *k, **kw): - pass - -urls = ( - 'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20' - 'targetting%20Win32/Personal%20Builds/mingw-builds/installer/' - 'repository.txt', - 'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/' - 'repository.txt' -) -''' -A list of mingw-build repositories -''' - -def repository(urls = urls, log = EmptyLogger()): - ''' - Downloads and parse mingw-build repository files and parses them - ''' - log.info('getting mingw-builds repository') - versions = {} - re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files') - re_sub = r'http://downloads.sourceforge.net/project/\1' - for url in urls: - log.debug(' - requesting: %s', url) - socket = request.urlopen(url) - repo = socket.read() - if not isinstance(repo, str): - repo = repo.decode(); - socket.close() - for entry in repo.split('\n')[:-1]: - value = entry.split('|') - version = tuple([int(n) for n in value[0].strip().split('.')]) - version = versions.setdefault(version, {}) - arch = value[1].strip() - if arch == 'x32': - arch = 
'i686' - elif arch == 'x64': - arch = 'x86_64' - arch = version.setdefault(arch, {}) - threading = arch.setdefault(value[2].strip(), {}) - exceptions = threading.setdefault(value[3].strip(), {}) - revision = exceptions.setdefault(int(value[4].strip()[3:]), - re_sourceforge.sub(re_sub, value[5].strip())) - return versions - -def find_in_path(file, path=None): - ''' - Attempts to find an executable in the path - ''' - if platform.system() == 'Windows': - file += '.exe' - if path is None: - path = os.environ.get('PATH', '') - if type(path) is type(''): - path = path.split(os.pathsep) - return list(filter(os.path.exists, - map(lambda dir, file=file: os.path.join(dir, file), path))) - -def find_7zip(log = EmptyLogger()): - ''' - Attempts to find 7zip for unpacking the mingw-build archives - ''' - log.info('finding 7zip') - path = find_in_path('7z') - if not path: - key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip') - path, _ = winreg.QueryValueEx(key, 'Path') - path = [os.path.join(path, '7z.exe')] - log.debug('found \'%s\'', path[0]) - return path[0] - -find_7zip() - -def unpack(archive, location, log = EmptyLogger()): - ''' - Unpacks a mingw-builds archive - ''' - sevenzip = find_7zip(log) - log.info('unpacking %s', os.path.basename(archive)) - cmd = [sevenzip, 'x', archive, '-o' + location, '-y'] - log.debug(' - %r', cmd) - with open(os.devnull, 'w') as devnull: - subprocess.check_call(cmd, stdout = devnull) - -def download(url, location, log = EmptyLogger()): - ''' - Downloads and unpacks a mingw-builds archive - ''' - log.info('downloading MinGW') - log.debug(' - url: %s', url) - log.debug(' - location: %s', location) - - re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*') - - stream = request.urlopen(url) - try: - content = stream.getheader('Content-Disposition') or '' - except AttributeError: - content = stream.headers.getheader('Content-Disposition') or '' - matches = re_content.match(content) - if matches: - filename = 
matches.group(2) - else: - parsed = parse.urlparse(stream.geturl()) - filename = os.path.basename(parsed.path) - - try: - os.makedirs(location) - except OSError as e: - if e.errno == errno.EEXIST and os.path.isdir(location): - pass - else: - raise - - archive = os.path.join(location, filename) - with open(archive, 'wb') as out: - while True: - buf = stream.read(1024) - if not buf: - break - out.write(buf) - unpack(archive, location, log = log) - os.remove(archive) - - possible = os.path.join(location, 'mingw64') - if not os.path.exists(possible): - possible = os.path.join(location, 'mingw32') - if not os.path.exists(possible): - raise ValueError('Failed to find unpacked MinGW: ' + possible) - return possible - -def root(location = None, arch = None, version = None, threading = None, - exceptions = None, revision = None, log = EmptyLogger()): - ''' - Returns the root folder of a specific version of the mingw-builds variant - of gcc. Will download the compiler if needed - ''' - - # Get the repository if we don't have all the information - if not (arch and version and threading and exceptions and revision): - versions = repository(log = log) - - # Determine some defaults - version = version or max(versions.keys()) - if not arch: - arch = platform.machine().lower() - if arch == 'x86': - arch = 'i686' - elif arch == 'amd64': - arch = 'x86_64' - if not threading: - keys = versions[version][arch].keys() - if 'posix' in keys: - threading = 'posix' - elif 'win32' in keys: - threading = 'win32' - else: - threading = keys[0] - if not exceptions: - keys = versions[version][arch][threading].keys() - if 'seh' in keys: - exceptions = 'seh' - elif 'sjlj' in keys: - exceptions = 'sjlj' - else: - exceptions = keys[0] - if revision == None: - revision = max(versions[version][arch][threading][exceptions].keys()) - if not location: - location = os.path.join(tempfile.gettempdir(), 'mingw-builds') - - # Get the download url - url = versions[version][arch][threading][exceptions][revision] 
- - # Tell the user whatzzup - log.info('finding MinGW %s', '.'.join(str(v) for v in version)) - log.debug(' - arch: %s', arch) - log.debug(' - threading: %s', threading) - log.debug(' - exceptions: %s', exceptions) - log.debug(' - revision: %s', revision) - log.debug(' - url: %s', url) - - # Store each specific revision differently - slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}' - slug = slug.format( - version = '.'.join(str(v) for v in version), - arch = arch, - threading = threading, - exceptions = exceptions, - revision = revision - ) - if arch == 'x86_64': - root_dir = os.path.join(location, slug, 'mingw64') - elif arch == 'i686': - root_dir = os.path.join(location, slug, 'mingw32') - else: - raise ValueError('Unknown MinGW arch: ' + arch) - - # Download if needed - if not os.path.exists(root_dir): - downloaded = download(url, os.path.join(location, slug), log = log) - if downloaded != root_dir: - raise ValueError('The location of mingw did not match\n%s\n%s' - % (downloaded, root_dir)) - - return root_dir - -def str2ver(string): - ''' - Converts a version string into a tuple - ''' - try: - version = tuple(int(v) for v in string.split('.')) - if len(version) is not 3: - raise ValueError() - except ValueError: - raise argparse.ArgumentTypeError( - 'please provide a three digit version string') - return version - -def main(): - ''' - Invoked when the script is run directly by the python interpreter - ''' - parser = argparse.ArgumentParser( - description = 'Downloads a specific version of MinGW', - formatter_class = argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument('--location', - help = 'the location to download the compiler to', - default = os.path.join(tempfile.gettempdir(), 'mingw-builds')) - parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'], - help = 'the target MinGW architecture string') - parser.add_argument('--version', type = str2ver, - help = 'the version of GCC to download') - 
parser.add_argument('--threading', choices = ['posix', 'win32'], - help = 'the threading type of the compiler') - parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'], - help = 'the method to throw exceptions') - parser.add_argument('--revision', type=int, - help = 'the revision of the MinGW release') - group = parser.add_mutually_exclusive_group() - group.add_argument('-v', '--verbose', action='store_true', - help='increase the script output verbosity') - group.add_argument('-q', '--quiet', action='store_true', - help='only print errors and warning') - args = parser.parse_args() - - # Create the logger - logger = logging.getLogger('mingw') - handler = logging.StreamHandler() - formatter = logging.Formatter('%(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(logging.INFO) - if args.quiet: - logger.setLevel(logging.WARN) - if args.verbose: - logger.setLevel(logging.DEBUG) - - # Get MinGW - root_dir = root(location = args.location, arch = args.arch, - version = args.version, threading = args.threading, - exceptions = args.exceptions, revision = args.revision, - log = logger) - - sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin')) - -if __name__ == '__main__': - try: - main() - except IOError as e: - sys.stderr.write('IO error: %s\n' % e) - sys.exit(1) - except OSError as e: - sys.stderr.write('OS error: %s\n' % e) - sys.exit(1) - except KeyboardInterrupt as e: - sys.stderr.write('Killed\n') - sys.exit(1) diff --git a/ThirdParty/googlebenchmark/releasing.md b/ThirdParty/googlebenchmark/releasing.md deleted file mode 100644 index f0cd7010e3..0000000000 --- a/ThirdParty/googlebenchmark/releasing.md +++ /dev/null @@ -1,16 +0,0 @@ -# How to release - -* Make sure you're on master and synced to HEAD -* Ensure the project builds and tests run (sanity check only, obviously) - * `parallel -j0 exec ::: test/*_test` can help ensure everything at least - passes -* Prepare release notes - * `git log $(git describe 
--abbrev=0 --tags)..HEAD` gives you the list of - commits between the last annotated tag and HEAD - * Pick the most interesting. -* Create a release through github's interface - * Note this will create a lightweight tag. - * Update this to an annotated tag: - * `git pull --tags` - * `git tag -a -f ` - * `git push --force origin` diff --git a/ThirdParty/googlebenchmark/src/CMakeLists.txt b/ThirdParty/googlebenchmark/src/CMakeLists.txt index b47de6791c..daf82fb131 100644 --- a/ThirdParty/googlebenchmark/src/CMakeLists.txt +++ b/ThirdParty/googlebenchmark/src/CMakeLists.txt @@ -18,74 +18,97 @@ foreach(item ${BENCHMARK_MAIN}) endforeach() add_library(benchmark ${SOURCE_FILES}) +add_library(benchmark::benchmark ALIAS benchmark) set_target_properties(benchmark PROPERTIES OUTPUT_NAME "benchmark" VERSION ${GENERIC_LIB_VERSION} SOVERSION ${GENERIC_LIB_SOVERSION} ) target_include_directories(benchmark PUBLIC - $ - ) + $ +) -# Link threads. -target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) -find_library(LIBRT rt) -if(LIBRT) - target_link_libraries(benchmark ${LIBRT}) +# libpfm, if available +if (PFM_FOUND) + target_link_libraries(benchmark PRIVATE PFM::libpfm) + target_compile_definitions(benchmark PRIVATE -DHAVE_LIBPFM) endif() -if(CMAKE_BUILD_TYPE) - string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER) -endif() -if(NOT CMAKE_THREAD_LIBS_INIT AND "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}" MATCHES ".*-fsanitize=[^ ]*address.*") - message(WARNING "CMake's FindThreads.cmake did not fail, but CMAKE_THREAD_LIBS_INIT ended up being empty. This was fixed in https://github.com/Kitware/CMake/commit/d53317130e84898c5328c237186dbd995aaf1c12 Let's guess that -pthread is sufficient.") - target_link_libraries(benchmark -pthread) +# pthread affinity, if available +if(HAVE_PTHREAD_AFFINITY) + target_compile_definitions(benchmark PRIVATE -DBENCHMARK_HAS_PTHREAD_AFFINITY) endif() +# Link threads. 
+target_link_libraries(benchmark PRIVATE Threads::Threads) + +target_link_libraries(benchmark PRIVATE ${BENCHMARK_CXX_LIBRARIES}) + +if(HAVE_LIB_RT) + target_link_libraries(benchmark PRIVATE rt) +endif(HAVE_LIB_RT) + + # We need extra libraries on Windows if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") - target_link_libraries(benchmark Shlwapi) + target_link_libraries(benchmark PRIVATE shlwapi) endif() # We need extra libraries on Solaris if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS") - target_link_libraries(benchmark kstat) + target_link_libraries(benchmark PRIVATE kstat) +endif() + +if (NOT BUILD_SHARED_LIBS) + target_compile_definitions(benchmark PUBLIC -DBENCHMARK_STATIC_DEFINE) endif() # Benchmark main library add_library(benchmark_main "benchmark_main.cc") +add_library(benchmark::benchmark_main ALIAS benchmark_main) set_target_properties(benchmark_main PROPERTIES OUTPUT_NAME "benchmark_main" VERSION ${GENERIC_LIB_VERSION} SOVERSION ${GENERIC_LIB_SOVERSION} + DEFINE_SYMBOL benchmark_EXPORTS ) -target_include_directories(benchmark PUBLIC - $ - ) -target_link_libraries(benchmark_main benchmark) - +target_link_libraries(benchmark_main PUBLIC benchmark::benchmark) -set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated") +set(generated_dir "${PROJECT_BINARY_DIR}") set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake") set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake") set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc") +set(targets_to_export benchmark benchmark_main) set(targets_export_name "${PROJECT_NAME}Targets") set(namespace "${PROJECT_NAME}::") include(CMakePackageConfigHelpers) + +configure_package_config_file ( + ${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in + ${project_config} + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} + NO_SET_AND_CHECK_MACRO + NO_CHECK_REQUIRED_COMPONENTS_MACRO +) write_basic_package_version_file( "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion ) 
-configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY) configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY) +export ( + TARGETS ${targets_to_export} + NAMESPACE "${namespace}" + FILE ${generated_dir}/${targets_export_name}.cmake +) + if (BENCHMARK_ENABLE_INSTALL) # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable) install( - TARGETS benchmark benchmark_main + TARGETS ${targets_to_export} EXPORT ${targets_export_name} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} @@ -94,6 +117,7 @@ if (BENCHMARK_ENABLE_INSTALL) install( DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark" + "${PROJECT_BINARY_DIR}/include/benchmark" DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN "*.*h") @@ -110,3 +134,37 @@ if (BENCHMARK_ENABLE_INSTALL) NAMESPACE "${namespace}" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") endif() + +if (BENCHMARK_ENABLE_DOXYGEN) + find_package(Doxygen REQUIRED) + set(DOXYGEN_QUIET YES) + set(DOXYGEN_RECURSIVE YES) + set(DOXYGEN_GENERATE_HTML YES) + set(DOXYGEN_GENERATE_MAN NO) + set(DOXYGEN_MARKDOWN_SUPPORT YES) + set(DOXYGEN_BUILTIN_STL_SUPPORT YES) + set(DOXYGEN_EXTRACT_PACKAGE YES) + set(DOXYGEN_EXTRACT_STATIC YES) + set(DOXYGEN_SHOW_INCLUDE_FILES YES) + set(DOXYGEN_BINARY_TOC YES) + set(DOXYGEN_TOC_EXPAND YES) + set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "index.md") + doxygen_add_docs(benchmark_doxygen + docs + include + src + ALL + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + COMMENT "Building documentation with Doxygen.") + if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS) + install( + DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html/" + DESTINATION ${CMAKE_INSTALL_DOCDIR}) + endif() +else() + if (BENCHMARK_ENABLE_INSTALL AND BENCHMARK_INSTALL_DOCS) + install( + DIRECTORY "${PROJECT_SOURCE_DIR}/docs/" + DESTINATION ${CMAKE_INSTALL_DOCDIR}) + endif() +endif() diff --git 
a/ThirdParty/googlebenchmark/src/benchmark.cc b/ThirdParty/googlebenchmark/src/benchmark.cc index 29bfa3512f..6139e59d05 100644 --- a/ThirdParty/googlebenchmark/src/benchmark.cc +++ b/ThirdParty/googlebenchmark/src/benchmark.cc @@ -13,12 +13,13 @@ // limitations under the License. #include "benchmark/benchmark.h" + #include "benchmark_api_internal.h" #include "benchmark_runner.h" #include "internal_macros.h" #ifndef BENCHMARK_OS_WINDOWS -#ifndef BENCHMARK_OS_FUCHSIA +#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT) #include #endif #include @@ -32,7 +33,10 @@ #include #include #include +#include +#include #include +#include #include #include #include @@ -45,100 +49,146 @@ #include "internal_macros.h" #include "log.h" #include "mutex.h" +#include "perf_counters.h" #include "re.h" #include "statistics.h" #include "string_util.h" #include "thread_manager.h" #include "thread_timer.h" -DEFINE_bool(benchmark_list_tests, false, - "Print a list of benchmarks. This option overrides all other " - "options."); - -DEFINE_string(benchmark_filter, ".", - "A regular expression that specifies the set of benchmarks " - "to execute. If this flag is empty, or if this flag is the " - "string \"all\", all benchmarks linked into the binary are " - "run."); - -DEFINE_double(benchmark_min_time, 0.5, - "Minimum number of seconds we should run benchmark before " - "results are considered significant. For cpu-time based " - "tests, this is the lower bound on the total cpu time " - "used by all threads that make up the test. For real-time " - "based tests, this is the lower bound on the elapsed time " - "of the benchmark execution, regardless of number of " - "threads."); - -DEFINE_int32(benchmark_repetitions, 1, - "The number of runs of each benchmark. If greater than 1, the " - "mean and standard deviation of the runs will be reported."); - -DEFINE_bool( - benchmark_report_aggregates_only, false, - "Report the result of each benchmark repetitions. 
When 'true' is specified " - "only the mean, standard deviation, and other statistics are reported for " - "repeated benchmarks. Affects all reporters."); - -DEFINE_bool( - benchmark_display_aggregates_only, false, - "Display the result of each benchmark repetitions. When 'true' is " - "specified only the mean, standard deviation, and other statistics are " - "displayed for repeated benchmarks. Unlike " - "benchmark_report_aggregates_only, only affects the display reporter, but " - "*NOT* file reporter, which will still contain all the output."); - -DEFINE_string(benchmark_format, "console", - "The format to use for console output. Valid values are " - "'console', 'json', or 'csv'."); - -DEFINE_string(benchmark_out_format, "json", - "The format to use for file output. Valid values are " - "'console', 'json', or 'csv'."); - -DEFINE_string(benchmark_out, "", "The file to write additional output to"); - -DEFINE_string(benchmark_color, "auto", - "Whether to use colors in the output. Valid values: " - "'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use " - "colors if the output is being sent to a terminal and the TERM " - "environment variable is set to a terminal type that supports " - "colors."); - -DEFINE_bool(benchmark_counters_tabular, false, - "Whether to use tabular format when printing user counters to " - "the console. Valid values: 'true'/'yes'/1, 'false'/'no'/0." - "Defaults to false."); - -DEFINE_int32(v, 0, "The level of verbose logging to output"); - namespace benchmark { +// Print a list of benchmarks. This option overrides all other options. +BM_DEFINE_bool(benchmark_list_tests, false); + +// A regular expression that specifies the set of benchmarks to execute. If +// this flag is empty, or if this flag is the string \"all\", all benchmarks +// linked into the binary are run. +BM_DEFINE_string(benchmark_filter, ""); + +// Specification of how long to run the benchmark. 
+// +// It can be either an exact number of iterations (specified as `x`), +// or a minimum number of seconds (specified as `s`). If the latter +// format (ie., min seconds) is used, the system may run the benchmark longer +// until the results are considered significant. +// +// For backward compatibility, the `s` suffix may be omitted, in which case, +// the specified number is interpreted as the number of seconds. +// +// For cpu-time based tests, this is the lower bound +// on the total cpu time used by all threads that make up the test. For +// real-time based tests, this is the lower bound on the elapsed time of the +// benchmark execution, regardless of number of threads. +BM_DEFINE_string(benchmark_min_time, kDefaultMinTimeStr); + +// Minimum number of seconds a benchmark should be run before results should be +// taken into account. This e.g can be necessary for benchmarks of code which +// needs to fill some form of cache before performance is of interest. +// Note: results gathered within this period are discarded and not used for +// reported result. +BM_DEFINE_double(benchmark_min_warmup_time, 0.0); + +// The number of runs of each benchmark. If greater than 1, the mean and +// standard deviation of the runs will be reported. +BM_DEFINE_int32(benchmark_repetitions, 1); + +// If set, enable random interleaving of repetitions of all benchmarks. +// See http://github.com/google/benchmark/issues/1051 for details. +BM_DEFINE_bool(benchmark_enable_random_interleaving, false); + +// Report the result of each benchmark repetitions. When 'true' is specified +// only the mean, standard deviation, and other statistics are reported for +// repeated benchmarks. Affects all reporters. +BM_DEFINE_bool(benchmark_report_aggregates_only, false); + +// Display the result of each benchmark repetitions. When 'true' is specified +// only the mean, standard deviation, and other statistics are displayed for +// repeated benchmarks. 
Unlike benchmark_report_aggregates_only, only affects +// the display reporter, but *NOT* file reporter, which will still contain +// all the output. +BM_DEFINE_bool(benchmark_display_aggregates_only, false); + +// The format to use for console output. +// Valid values are 'console', 'json', or 'csv'. +BM_DEFINE_string(benchmark_format, "console"); + +// The format to use for file output. +// Valid values are 'console', 'json', or 'csv'. +BM_DEFINE_string(benchmark_out_format, "json"); + +// The file to write additional output to. +BM_DEFINE_string(benchmark_out, ""); + +// Whether to use colors in the output. Valid values: +// 'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use colors if +// the output is being sent to a terminal and the TERM environment variable is +// set to a terminal type that supports colors. +BM_DEFINE_string(benchmark_color, "auto"); + +// Whether to use tabular format when printing user counters to the console. +// Valid values: 'true'/'yes'/1, 'false'/'no'/0. Defaults to false. +BM_DEFINE_bool(benchmark_counters_tabular, false); + +// List of additional perf counters to collect, in libpfm format. For more +// information about libpfm: https://man7.org/linux/man-pages/man3/libpfm.3.html +BM_DEFINE_string(benchmark_perf_counters, ""); + +// Extra context to include in the output formatted as comma-separated key-value +// pairs. Kept internal as it's only used for parsing from env/command line. +BM_DEFINE_kvpairs(benchmark_context, {}); + +// Set the default time unit to use for reports +// Valid values are 'ns', 'us', 'ms' or 's' +BM_DEFINE_string(benchmark_time_unit, ""); + +// The level of verbose logging to output +BM_DEFINE_int32(v, 0); namespace internal { +std::map* global_context = nullptr; + +BENCHMARK_EXPORT std::map*& GetGlobalContext() { + return global_context; +} + // FIXME: wouldn't LTO mess this up? 
void UseCharPointer(char const volatile*) {} } // namespace internal -State::State(IterationCount max_iters, const std::vector& ranges, - int thread_i, int n_threads, internal::ThreadTimer* timer, - internal::ThreadManager* manager) +State::State(std::string name, IterationCount max_iters, + const std::vector& ranges, int thread_i, int n_threads, + internal::ThreadTimer* timer, internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement) : total_iterations_(0), batch_leftover_(0), max_iterations(max_iters), started_(false), finished_(false), - error_occurred_(false), + skipped_(internal::NotSkipped), range_(ranges), complexity_n_(0), - counters(), - thread_index(thread_i), - threads(n_threads), + name_(std::move(name)), + thread_index_(thread_i), + threads_(n_threads), timer_(timer), - manager_(manager) { - CHECK(max_iterations != 0) << "At least one iteration must be run"; - CHECK_LT(thread_index, threads) << "thread_index must be less than threads"; + manager_(manager), + perf_counters_measurement_(perf_counters_measurement) { + BM_CHECK(max_iterations != 0) << "At least one iteration must be run"; + BM_CHECK_LT(thread_index_, threads_) + << "thread_index must be less than threads"; + + // Add counters with correct flag now. If added with `counters[name]` in + // `PauseTiming`, a new `Counter` will be inserted the first time, which + // won't have the flag. Inserting them now also reduces the allocations + // during the benchmark. + if (perf_counters_measurement_) { + for (const std::string& counter_name : + perf_counters_measurement_->names()) { + counters[counter_name] = Counter(0.0, Counter::kAvgIterations); + } + } // Note: The use of offsetof below is technically undefined until C++17 // because State is not a standard layout type. However, all compilers @@ -148,42 +198,83 @@ State::State(IterationCount max_iters, const std::vector& ranges, // which must be suppressed. 
#if defined(__INTEL_COMPILER) #pragma warning push -#pragma warning(disable:1875) +#pragma warning(disable : 1875) #elif defined(__GNUC__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Winvalid-offsetof" +#endif +#if defined(__NVCC__) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 1427 +#endif +#if defined(__NVCOMPILER) +#pragma diagnostic push +#pragma diag_suppress offset_in_non_POD_nonstandard #endif // Offset tests to ensure commonly accessed data is on the first cache line. const int cache_line_size = 64; - static_assert(offsetof(State, error_occurred_) <= - (cache_line_size - sizeof(error_occurred_)), - ""); + static_assert( + offsetof(State, skipped_) <= (cache_line_size - sizeof(skipped_)), ""); #if defined(__INTEL_COMPILER) #pragma warning pop #elif defined(__GNUC__) #pragma GCC diagnostic pop #endif +#if defined(__NVCC__) +#pragma nv_diagnostic pop +#endif +#if defined(__NVCOMPILER) +#pragma diagnostic pop +#endif } void State::PauseTiming() { // Add in time accumulated so far - CHECK(started_ && !finished_ && !error_occurred_); + BM_CHECK(started_ && !finished_ && !skipped()); timer_->StopTimer(); + if (perf_counters_measurement_) { + std::vector> measurements; + if (!perf_counters_measurement_->Stop(measurements)) { + BM_CHECK(false) << "Perf counters read the value failed."; + } + for (const auto& name_and_measurement : measurements) { + const std::string& name = name_and_measurement.first; + const double measurement = name_and_measurement.second; + // Counter was inserted with `kAvgIterations` flag by the constructor. 
+ assert(counters.find(name) != counters.end()); + counters[name].value += measurement; + } + } } void State::ResumeTiming() { - CHECK(started_ && !finished_ && !error_occurred_); + BM_CHECK(started_ && !finished_ && !skipped()); timer_->StartTimer(); + if (perf_counters_measurement_) { + perf_counters_measurement_->Start(); + } +} + +void State::SkipWithMessage(const std::string& msg) { + skipped_ = internal::SkippedWithMessage; + { + MutexLock l(manager_->GetBenchmarkMutex()); + if (internal::NotSkipped == manager_->results.skipped_) { + manager_->results.skip_message_ = msg; + manager_->results.skipped_ = skipped_; + } + } + total_iterations_ = 0; + if (timer_->running()) timer_->StopTimer(); } -void State::SkipWithError(const char* msg) { - CHECK(msg); - error_occurred_ = true; +void State::SkipWithError(const std::string& msg) { + skipped_ = internal::SkippedWithError; { MutexLock l(manager_->GetBenchmarkMutex()); - if (manager_->results.has_error_ == false) { - manager_->results.error_message_ = msg; - manager_->results.has_error_ = true; + if (internal::NotSkipped == manager_->results.skipped_) { + manager_->results.skip_message_ = msg; + manager_->results.skipped_ = skipped_; } } total_iterations_ = 0; @@ -194,22 +285,22 @@ void State::SetIterationTime(double seconds) { timer_->SetIterationTime(seconds); } -void State::SetLabel(const char* label) { +void State::SetLabel(const std::string& label) { MutexLock l(manager_->GetBenchmarkMutex()); manager_->results.report_label_ = label; } void State::StartKeepRunning() { - CHECK(!started_ && !finished_); + BM_CHECK(!started_ && !finished_); started_ = true; - total_iterations_ = error_occurred_ ? 0 : max_iterations; + total_iterations_ = skipped() ? 
0 : max_iterations; manager_->StartStopBarrier(); - if (!error_occurred_) ResumeTiming(); + if (!skipped()) ResumeTiming(); } void State::FinishKeepRunning() { - CHECK(started_ && (!finished_ || error_occurred_)); - if (!error_occurred_) { + BM_CHECK(started_ && (!finished_ || skipped())); + if (!skipped()) { PauseTiming(); } // Total iterations has now wrapped around past 0. Fix this. @@ -221,11 +312,42 @@ void State::FinishKeepRunning() { namespace internal { namespace { +// Flushes streams after invoking reporter methods that write to them. This +// ensures users get timely updates even when streams are not line-buffered. +void FlushStreams(BenchmarkReporter* reporter) { + if (!reporter) return; + std::flush(reporter->GetOutputStream()); + std::flush(reporter->GetErrorStream()); +} + +// Reports in both display and file reporters. +void Report(BenchmarkReporter* display_reporter, + BenchmarkReporter* file_reporter, const RunResults& run_results) { + auto report_one = [](BenchmarkReporter* reporter, bool aggregates_only, + const RunResults& results) { + assert(reporter); + // If there are no aggregates, do output non-aggregates. + aggregates_only &= !results.aggregates_only.empty(); + if (!aggregates_only) reporter->ReportRuns(results.non_aggregates); + if (!results.aggregates_only.empty()) + reporter->ReportRuns(results.aggregates_only); + }; + + report_one(display_reporter, run_results.display_report_aggregates_only, + run_results); + if (file_reporter) + report_one(file_reporter, run_results.file_report_aggregates_only, + run_results); + + FlushStreams(display_reporter); + FlushStreams(file_reporter); +} + void RunBenchmarks(const std::vector& benchmarks, BenchmarkReporter* display_reporter, BenchmarkReporter* file_reporter) { // Note the file_reporter can be null. - CHECK(display_reporter != nullptr); + BM_CHECK(display_reporter != nullptr); // Determine the width of the name field using a minimum width of 10. 
bool might_have_aggregates = FLAGS_benchmark_repetitions > 1; @@ -233,10 +355,10 @@ void RunBenchmarks(const std::vector& benchmarks, size_t stat_field_width = 0; for (const BenchmarkInstance& benchmark : benchmarks) { name_field_width = - std::max(name_field_width, benchmark.name.str().size()); - might_have_aggregates |= benchmark.repetitions > 1; + std::max(name_field_width, benchmark.name().str().size()); + might_have_aggregates |= benchmark.repetitions() > 1; - for (const auto& Stat : *benchmark.statistics) + for (const auto& Stat : benchmark.statistics()) stat_field_width = std::max(stat_field_width, Stat.name_.size()); } if (might_have_aggregates) name_field_width += 1 + stat_field_width; @@ -245,65 +367,130 @@ void RunBenchmarks(const std::vector& benchmarks, BenchmarkReporter::Context context; context.name_field_width = name_field_width; - // Keep track of running times of all instances of current benchmark - std::vector complexity_reports; - - // We flush streams after invoking reporter methods that write to them. This - // ensures users get timely updates even when streams are not line-buffered. - auto flushStreams = [](BenchmarkReporter* reporter) { - if (!reporter) return; - std::flush(reporter->GetOutputStream()); - std::flush(reporter->GetErrorStream()); - }; + // Keep track of running times of all instances of each benchmark family. + std::map + per_family_reports; if (display_reporter->ReportContext(context) && (!file_reporter || file_reporter->ReportContext(context))) { - flushStreams(display_reporter); - flushStreams(file_reporter); - - for (const auto& benchmark : benchmarks) { - RunResults run_results = RunBenchmark(benchmark, &complexity_reports); - - auto report = [&run_results](BenchmarkReporter* reporter, - bool report_aggregates_only) { - assert(reporter); - // If there are no aggregates, do output non-aggregates. 
- report_aggregates_only &= !run_results.aggregates_only.empty(); - if (!report_aggregates_only) - reporter->ReportRuns(run_results.non_aggregates); - if (!run_results.aggregates_only.empty()) - reporter->ReportRuns(run_results.aggregates_only); - }; - - report(display_reporter, run_results.display_report_aggregates_only); - if (file_reporter) - report(file_reporter, run_results.file_report_aggregates_only); + FlushStreams(display_reporter); + FlushStreams(file_reporter); + + size_t num_repetitions_total = 0; + + // This perfcounters object needs to be created before the runners vector + // below so it outlasts their lifetime. + PerfCountersMeasurement perfcounters( + StrSplit(FLAGS_benchmark_perf_counters, ',')); + + // Vector of benchmarks to run + std::vector runners; + runners.reserve(benchmarks.size()); + + // Count the number of benchmarks with threads to warn the user in case + // performance counters are used. + int benchmarks_with_threads = 0; + + // Loop through all benchmarks + for (const BenchmarkInstance& benchmark : benchmarks) { + BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr; + if (benchmark.complexity() != oNone) + reports_for_family = &per_family_reports[benchmark.family_index()]; + benchmarks_with_threads += (benchmark.threads() > 1); + runners.emplace_back(benchmark, &perfcounters, reports_for_family); + int num_repeats_of_this_instance = runners.back().GetNumRepeats(); + num_repetitions_total += num_repeats_of_this_instance; + if (reports_for_family) + reports_for_family->num_runs_total += num_repeats_of_this_instance; + } + assert(runners.size() == benchmarks.size() && "Unexpected runner count."); + + // The use of performance counters with threads would be unintuitive for + // the average user so we need to warn them about this case + if ((benchmarks_with_threads > 0) && (perfcounters.num_counters() > 0)) { + GetErrorLogInstance() + << "***WARNING*** There are " << benchmarks_with_threads + << " benchmarks with threads 
and " << perfcounters.num_counters() + << " performance counters were requested. Beware counters will " + "reflect the combined usage across all " + "threads.\n"; + } - flushStreams(display_reporter); - flushStreams(file_reporter); + std::vector repetition_indices; + repetition_indices.reserve(num_repetitions_total); + for (size_t runner_index = 0, num_runners = runners.size(); + runner_index != num_runners; ++runner_index) { + const internal::BenchmarkRunner& runner = runners[runner_index]; + std::fill_n(std::back_inserter(repetition_indices), + runner.GetNumRepeats(), runner_index); + } + assert(repetition_indices.size() == num_repetitions_total && + "Unexpected number of repetition indexes."); + + if (FLAGS_benchmark_enable_random_interleaving) { + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(repetition_indices.begin(), repetition_indices.end(), g); + } + + for (size_t repetition_index : repetition_indices) { + internal::BenchmarkRunner& runner = runners[repetition_index]; + runner.DoOneRepetition(); + if (runner.HasRepeatsRemaining()) continue; + // FIXME: report each repetition separately, not all of them in bulk. 
+ + display_reporter->ReportRunsConfig( + runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters()); + if (file_reporter) + file_reporter->ReportRunsConfig( + runner.GetMinTime(), runner.HasExplicitIters(), runner.GetIters()); + + RunResults run_results = runner.GetResults(); + + // Maybe calculate complexity report + if (const auto* reports_for_family = runner.GetReportsForFamily()) { + if (reports_for_family->num_runs_done == + reports_for_family->num_runs_total) { + auto additional_run_stats = ComputeBigO(reports_for_family->Runs); + run_results.aggregates_only.insert(run_results.aggregates_only.end(), + additional_run_stats.begin(), + additional_run_stats.end()); + per_family_reports.erase( + static_cast(reports_for_family->Runs.front().family_index)); + } + } + + Report(display_reporter, file_reporter, run_results); } } display_reporter->Finalize(); if (file_reporter) file_reporter->Finalize(); - flushStreams(display_reporter); - flushStreams(file_reporter); + FlushStreams(display_reporter); + FlushStreams(file_reporter); } +// Disable deprecated warnings temporarily because we need to reference +// CSVReporter but don't want to trigger -Werror=-Wdeprecated-declarations +BENCHMARK_DISABLE_DEPRECATED_WARNING + std::unique_ptr CreateReporter( std::string const& name, ConsoleReporter::OutputOptions output_opts) { typedef std::unique_ptr PtrType; if (name == "console") { return PtrType(new ConsoleReporter(output_opts)); - } else if (name == "json") { - return PtrType(new JSONReporter); - } else if (name == "csv") { - return PtrType(new CSVReporter); - } else { - std::cerr << "Unexpected format: '" << name << "'\n"; - std::exit(1); } + if (name == "json") { + return PtrType(new JSONReporter()); + } + if (name == "csv") { + return PtrType(new CSVReporter()); + } + std::cerr << "Unexpected format: '" << name << "'\n"; + std::exit(1); } +BENCHMARK_RESTORE_DEPRECATED_WARNING + } // end namespace bool IsZero(double n) { @@ -312,7 +499,7 @@ bool IsZero(double n) 
{ ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) { int output_opts = ConsoleReporter::OO_Defaults; - auto is_benchmark_color = [force_no_color] () -> bool { + auto is_benchmark_color = [force_no_color]() -> bool { if (force_no_color) { return false; } @@ -336,17 +523,41 @@ ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) { } // end namespace internal +BenchmarkReporter* CreateDefaultDisplayReporter() { + static auto default_display_reporter = + internal::CreateReporter(FLAGS_benchmark_format, + internal::GetOutputOptions()) + .release(); + return default_display_reporter; +} + size_t RunSpecifiedBenchmarks() { - return RunSpecifiedBenchmarks(nullptr, nullptr); + return RunSpecifiedBenchmarks(nullptr, nullptr, FLAGS_benchmark_filter); +} + +size_t RunSpecifiedBenchmarks(std::string spec) { + return RunSpecifiedBenchmarks(nullptr, nullptr, std::move(spec)); } size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) { - return RunSpecifiedBenchmarks(display_reporter, nullptr); + return RunSpecifiedBenchmarks(display_reporter, nullptr, + FLAGS_benchmark_filter); +} + +size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, + std::string spec) { + return RunSpecifiedBenchmarks(display_reporter, nullptr, std::move(spec)); } size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, BenchmarkReporter* file_reporter) { - std::string spec = FLAGS_benchmark_filter; + return RunSpecifiedBenchmarks(display_reporter, file_reporter, + FLAGS_benchmark_filter); +} + +size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, + BenchmarkReporter* file_reporter, + std::string spec) { if (spec.empty() || spec == "all") spec = "."; // Regexp that matches all benchmarks @@ -355,8 +566,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, std::unique_ptr default_display_reporter; std::unique_ptr default_file_reporter; if (!display_reporter) { - default_display_reporter = 
internal::CreateReporter( - FLAGS_benchmark_format, internal::GetOutputOptions()); + default_display_reporter.reset(CreateDefaultDisplayReporter()); display_reporter = default_display_reporter.get(); } auto& Out = display_reporter->GetOutputStream(); @@ -372,12 +582,14 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, if (!fname.empty()) { output_file.open(fname); if (!output_file.is_open()) { - Err << "invalid file name: '" << fname << std::endl; + Err << "invalid file name: '" << fname << "'" << std::endl; std::exit(1); } if (!file_reporter) { default_file_reporter = internal::CreateReporter( - FLAGS_benchmark_out_format, ConsoleReporter::OO_None); + FLAGS_benchmark_out_format, FLAGS_benchmark_counters_tabular + ? ConsoleReporter::OO_Tabular + : ConsoleReporter::OO_None); file_reporter = default_file_reporter.get(); } file_reporter->SetOutputStream(&output_file); @@ -394,7 +606,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, if (FLAGS_benchmark_list_tests) { for (auto const& benchmark : benchmarks) - Out << benchmark.name.str() << "\n"; + Out << benchmark.name().str() << "\n"; } else { internal::RunBenchmarks(benchmarks, display_reporter, file_reporter); } @@ -402,42 +614,80 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, return benchmarks.size(); } +namespace { +// stores the time unit benchmarks use by default +TimeUnit default_time_unit = kNanosecond; +} // namespace + +TimeUnit GetDefaultTimeUnit() { return default_time_unit; } + +void SetDefaultTimeUnit(TimeUnit unit) { default_time_unit = unit; } + +std::string GetBenchmarkFilter() { return FLAGS_benchmark_filter; } + +void SetBenchmarkFilter(std::string value) { + FLAGS_benchmark_filter = std::move(value); +} + +int32_t GetBenchmarkVerbosity() { return FLAGS_v; } + void RegisterMemoryManager(MemoryManager* manager) { internal::memory_manager = manager; } +void AddCustomContext(const std::string& key, const std::string& value) { + if 
(internal::global_context == nullptr) { + internal::global_context = new std::map(); + } + if (!internal::global_context->emplace(key, value).second) { + std::cerr << "Failed to add custom context \"" << key << "\" as it already " + << "exists with value \"" << value << "\"\n"; + } +} + namespace internal { +void (*HelperPrintf)(); + void PrintUsageAndExit() { - fprintf(stdout, - "benchmark" - " [--benchmark_list_tests={true|false}]\n" - " [--benchmark_filter=]\n" - " [--benchmark_min_time=]\n" - " [--benchmark_repetitions=]\n" - " [--benchmark_report_aggregates_only={true|false}]\n" - " [--benchmark_display_aggregates_only={true|false}]\n" - " [--benchmark_format=]\n" - " [--benchmark_out=]\n" - " [--benchmark_out_format=]\n" - " [--benchmark_color={auto|true|false}]\n" - " [--benchmark_counters_tabular={true|false}]\n" - " [--v=]\n"); + HelperPrintf(); exit(0); } +void SetDefaultTimeUnitFromFlag(const std::string& time_unit_flag) { + if (time_unit_flag == "s") { + return SetDefaultTimeUnit(kSecond); + } + if (time_unit_flag == "ms") { + return SetDefaultTimeUnit(kMillisecond); + } + if (time_unit_flag == "us") { + return SetDefaultTimeUnit(kMicrosecond); + } + if (time_unit_flag == "ns") { + return SetDefaultTimeUnit(kNanosecond); + } + if (!time_unit_flag.empty()) { + PrintUsageAndExit(); + } +} + void ParseCommandLineFlags(int* argc, char** argv) { using namespace benchmark; BenchmarkReporter::Context::executable_name = (argc && *argc > 0) ? 
argv[0] : "unknown"; - for (int i = 1; i < *argc; ++i) { + for (int i = 1; argc && i < *argc; ++i) { if (ParseBoolFlag(argv[i], "benchmark_list_tests", &FLAGS_benchmark_list_tests) || ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) || - ParseDoubleFlag(argv[i], "benchmark_min_time", + ParseStringFlag(argv[i], "benchmark_min_time", &FLAGS_benchmark_min_time) || + ParseDoubleFlag(argv[i], "benchmark_min_warmup_time", + &FLAGS_benchmark_min_warmup_time) || ParseInt32Flag(argv[i], "benchmark_repetitions", &FLAGS_benchmark_repetitions) || + ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving", + &FLAGS_benchmark_enable_random_interleaving) || ParseBoolFlag(argv[i], "benchmark_report_aggregates_only", &FLAGS_benchmark_report_aggregates_only) || ParseBoolFlag(argv[i], "benchmark_display_aggregates_only", @@ -447,11 +697,14 @@ void ParseCommandLineFlags(int* argc, char** argv) { ParseStringFlag(argv[i], "benchmark_out_format", &FLAGS_benchmark_out_format) || ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) || - // "color_print" is the deprecated name for "benchmark_color". - // TODO: Remove this. 
- ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) || ParseBoolFlag(argv[i], "benchmark_counters_tabular", &FLAGS_benchmark_counters_tabular) || + ParseStringFlag(argv[i], "benchmark_perf_counters", + &FLAGS_benchmark_perf_counters) || + ParseKeyValueFlag(argv[i], "benchmark_context", + &FLAGS_benchmark_context) || + ParseStringFlag(argv[i], "benchmark_time_unit", + &FLAGS_benchmark_time_unit) || ParseInt32Flag(argv[i], "v", &FLAGS_v)) { for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1]; @@ -462,13 +715,18 @@ void ParseCommandLineFlags(int* argc, char** argv) { } } for (auto const* flag : - {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) + {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) { if (*flag != "console" && *flag != "json" && *flag != "csv") { PrintUsageAndExit(); } + } + SetDefaultTimeUnitFromFlag(FLAGS_benchmark_time_unit); if (FLAGS_benchmark_color.empty()) { PrintUsageAndExit(); } + for (const auto& kv : FLAGS_benchmark_context) { + AddCustomContext(kv.first, kv.second); + } } int InitializeStreams() { @@ -478,11 +736,38 @@ int InitializeStreams() { } // end namespace internal -void Initialize(int* argc, char** argv) { +void PrintDefaultHelp() { + fprintf(stdout, + "benchmark" + " [--benchmark_list_tests={true|false}]\n" + " [--benchmark_filter=]\n" + " [--benchmark_min_time=`x` OR `s` ]\n" + " [--benchmark_min_warmup_time=]\n" + " [--benchmark_repetitions=]\n" + " [--benchmark_enable_random_interleaving={true|false}]\n" + " [--benchmark_report_aggregates_only={true|false}]\n" + " [--benchmark_display_aggregates_only={true|false}]\n" + " [--benchmark_format=]\n" + " [--benchmark_out=]\n" + " [--benchmark_out_format=]\n" + " [--benchmark_color={auto|true|false}]\n" + " [--benchmark_counters_tabular={true|false}]\n" +#if defined HAVE_LIBPFM + " [--benchmark_perf_counters=,...]\n" +#endif + " [--benchmark_context==,...]\n" + " [--benchmark_time_unit={ns|us|ms|s}]\n" + " [--v=]\n"); +} + +void Initialize(int* argc, 
char** argv, void (*HelperPrintf)()) { + internal::HelperPrintf = HelperPrintf; internal::ParseCommandLineFlags(argc, argv); internal::LogLevel() = FLAGS_v; } +void Shutdown() { delete internal::global_context; } + bool ReportUnrecognizedArguments(int argc, char** argv) { for (int i = 1; i < argc; ++i) { fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0], diff --git a/ThirdParty/googlebenchmark/src/benchmark_api_internal.cc b/ThirdParty/googlebenchmark/src/benchmark_api_internal.cc index d468a257e3..286f986530 100644 --- a/ThirdParty/googlebenchmark/src/benchmark_api_internal.cc +++ b/ThirdParty/googlebenchmark/src/benchmark_api_internal.cc @@ -1,15 +1,118 @@ #include "benchmark_api_internal.h" +#include + +#include "string_util.h" + namespace benchmark { namespace internal { -State BenchmarkInstance::Run(IterationCount iters, int thread_id, - internal::ThreadTimer* timer, - internal::ThreadManager* manager) const { - State st(iters, arg, thread_id, threads, timer, manager); - benchmark->Run(st); +BenchmarkInstance::BenchmarkInstance(Benchmark* benchmark, int family_idx, + int per_family_instance_idx, + const std::vector& args, + int thread_count) + : benchmark_(*benchmark), + family_index_(family_idx), + per_family_instance_index_(per_family_instance_idx), + aggregation_report_mode_(benchmark_.aggregation_report_mode_), + args_(args), + time_unit_(benchmark_.GetTimeUnit()), + measure_process_cpu_time_(benchmark_.measure_process_cpu_time_), + use_real_time_(benchmark_.use_real_time_), + use_manual_time_(benchmark_.use_manual_time_), + complexity_(benchmark_.complexity_), + complexity_lambda_(benchmark_.complexity_lambda_), + statistics_(benchmark_.statistics_), + repetitions_(benchmark_.repetitions_), + min_time_(benchmark_.min_time_), + min_warmup_time_(benchmark_.min_warmup_time_), + iterations_(benchmark_.iterations_), + threads_(thread_count) { + name_.function_name = benchmark_.name_; + + size_t arg_i = 0; + for (const auto& arg : args) 
{ + if (!name_.args.empty()) { + name_.args += '/'; + } + + if (arg_i < benchmark->arg_names_.size()) { + const auto& arg_name = benchmark_.arg_names_[arg_i]; + if (!arg_name.empty()) { + name_.args += StrFormat("%s:", arg_name.c_str()); + } + } + + name_.args += StrFormat("%" PRId64, arg); + ++arg_i; + } + + if (!IsZero(benchmark->min_time_)) { + name_.min_time = StrFormat("min_time:%0.3f", benchmark_.min_time_); + } + + if (!IsZero(benchmark->min_warmup_time_)) { + name_.min_warmup_time = + StrFormat("min_warmup_time:%0.3f", benchmark_.min_warmup_time_); + } + + if (benchmark_.iterations_ != 0) { + name_.iterations = StrFormat( + "iterations:%lu", static_cast(benchmark_.iterations_)); + } + + if (benchmark_.repetitions_ != 0) { + name_.repetitions = StrFormat("repeats:%d", benchmark_.repetitions_); + } + + if (benchmark_.measure_process_cpu_time_) { + name_.time_type = "process_time"; + } + + if (benchmark_.use_manual_time_) { + if (!name_.time_type.empty()) { + name_.time_type += '/'; + } + name_.time_type += "manual_time"; + } else if (benchmark_.use_real_time_) { + if (!name_.time_type.empty()) { + name_.time_type += '/'; + } + name_.time_type += "real_time"; + } + + if (!benchmark_.thread_counts_.empty()) { + name_.threads = StrFormat("threads:%d", threads_); + } + + setup_ = benchmark_.setup_; + teardown_ = benchmark_.teardown_; +} + +State BenchmarkInstance::Run( + IterationCount iters, int thread_id, internal::ThreadTimer* timer, + internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement) const { + State st(name_.function_name, iters, args_, thread_id, threads_, timer, + manager, perf_counters_measurement); + benchmark_.Run(st); return st; } -} // internal -} // benchmark +void BenchmarkInstance::Setup() const { + if (setup_) { + State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_, + nullptr, nullptr, nullptr); + setup_(st); + } +} + +void BenchmarkInstance::Teardown() const { + if (teardown_) 
{ + State st(name_.function_name, /*iters*/ 1, args_, /*thread_id*/ 0, threads_, + nullptr, nullptr, nullptr); + teardown_(st); + } +} +} // namespace internal +} // namespace benchmark diff --git a/ThirdParty/googlebenchmark/src/benchmark_api_internal.h b/ThirdParty/googlebenchmark/src/benchmark_api_internal.h index 264eff95c5..94f516531b 100644 --- a/ThirdParty/googlebenchmark/src/benchmark_api_internal.h +++ b/ThirdParty/googlebenchmark/src/benchmark_api_internal.h @@ -1,9 +1,6 @@ #ifndef BENCHMARK_API_INTERNAL_H #define BENCHMARK_API_INTERNAL_H -#include "benchmark/benchmark.h" -#include "commandlineflags.h" - #include #include #include @@ -11,32 +8,68 @@ #include #include +#include "benchmark/benchmark.h" +#include "commandlineflags.h" + namespace benchmark { namespace internal { // Information kept per benchmark we may want to run -struct BenchmarkInstance { - BenchmarkName name; - Benchmark* benchmark; - AggregationReportMode aggregation_report_mode; - std::vector arg; - TimeUnit time_unit; - int range_multiplier; - bool measure_process_cpu_time; - bool use_real_time; - bool use_manual_time; - BigO complexity; - BigOFunc* complexity_lambda; - UserCounters counters; - const std::vector* statistics; - bool last_benchmark_instance; - int repetitions; - double min_time; - IterationCount iterations; - int threads; // Number of concurrent threads to us +class BenchmarkInstance { + public: + BenchmarkInstance(Benchmark* benchmark, int family_index, + int per_family_instance_index, + const std::vector& args, int threads); + + const BenchmarkName& name() const { return name_; } + int family_index() const { return family_index_; } + int per_family_instance_index() const { return per_family_instance_index_; } + AggregationReportMode aggregation_report_mode() const { + return aggregation_report_mode_; + } + TimeUnit time_unit() const { return time_unit_; } + bool measure_process_cpu_time() const { return measure_process_cpu_time_; } + bool use_real_time() const { return 
use_real_time_; } + bool use_manual_time() const { return use_manual_time_; } + BigO complexity() const { return complexity_; } + BigOFunc* complexity_lambda() const { return complexity_lambda_; } + const std::vector& statistics() const { return statistics_; } + int repetitions() const { return repetitions_; } + double min_time() const { return min_time_; } + double min_warmup_time() const { return min_warmup_time_; } + IterationCount iterations() const { return iterations_; } + int threads() const { return threads_; } + void Setup() const; + void Teardown() const; State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, - internal::ThreadManager* manager) const; + internal::ThreadManager* manager, + internal::PerfCountersMeasurement* perf_counters_measurement) const; + + private: + BenchmarkName name_; + Benchmark& benchmark_; + const int family_index_; + const int per_family_instance_index_; + AggregationReportMode aggregation_report_mode_; + const std::vector& args_; + TimeUnit time_unit_; + bool measure_process_cpu_time_; + bool use_real_time_; + bool use_manual_time_; + BigO complexity_; + BigOFunc* complexity_lambda_; + UserCounters counters_; + const std::vector& statistics_; + int repetitions_; + double min_time_; + double min_warmup_time_; + IterationCount iterations_; + int threads_; // Number of concurrent threads to use + + typedef void (*callback_function)(const benchmark::State&); + callback_function setup_ = nullptr; + callback_function teardown_ = nullptr; }; bool FindBenchmarksInternal(const std::string& re, @@ -45,6 +78,7 @@ bool FindBenchmarksInternal(const std::string& re, bool IsZero(double n); +BENCHMARK_EXPORT ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); } // end namespace internal diff --git a/ThirdParty/googlebenchmark/src/benchmark_main.cc b/ThirdParty/googlebenchmark/src/benchmark_main.cc index b3b2478314..cd61cd2ad5 100644 --- a/ThirdParty/googlebenchmark/src/benchmark_main.cc +++ 
b/ThirdParty/googlebenchmark/src/benchmark_main.cc @@ -14,4 +14,5 @@ #include "benchmark/benchmark.h" +BENCHMARK_EXPORT int main(int, char**); BENCHMARK_MAIN(); diff --git a/ThirdParty/googlebenchmark/src/benchmark_name.cc b/ThirdParty/googlebenchmark/src/benchmark_name.cc index 2a17ebce27..01676bbc84 100644 --- a/ThirdParty/googlebenchmark/src/benchmark_name.cc +++ b/ThirdParty/googlebenchmark/src/benchmark_name.cc @@ -51,8 +51,9 @@ std::string join(char delimiter, const Ts&... ts) { } } // namespace +BENCHMARK_EXPORT std::string BenchmarkName::str() const { - return join('/', function_name, args, min_time, iterations, repetitions, - time_type, threads); + return join('/', function_name, args, min_time, min_warmup_time, iterations, + repetitions, time_type, threads); } } // namespace benchmark diff --git a/ThirdParty/googlebenchmark/src/benchmark_register.cc b/ThirdParty/googlebenchmark/src/benchmark_register.cc index 7cf841ba16..e447c9a2d3 100644 --- a/ThirdParty/googlebenchmark/src/benchmark_register.cc +++ b/ThirdParty/googlebenchmark/src/benchmark_register.cc @@ -15,7 +15,7 @@ #include "benchmark_register.h" #ifndef BENCHMARK_OS_WINDOWS -#ifndef BENCHMARK_OS_FUCHSIA +#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT) #include #endif #include @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -31,11 +32,10 @@ #include #include #include +#include #include #include -#include - #include "benchmark/benchmark.h" #include "benchmark_api_internal.h" #include "check.h" @@ -53,10 +53,13 @@ namespace benchmark { namespace { // For non-dense Range, intermediate values are powers of kRangeMultiplier. -static const int kRangeMultiplier = 8; +static constexpr int kRangeMultiplier = 8; + // The size of a benchmark family determines is the number of inputs to repeat // the benchmark on. If this is "large" then warn the user during configuration. 
-static const size_t kMaxFamilySize = 100; +static constexpr size_t kMaxFamilySize = 100; + +static constexpr char kDisabledPrefix[] = "DISABLED_"; } // end namespace namespace internal { @@ -111,15 +114,15 @@ void BenchmarkFamilies::ClearBenchmarks() { bool BenchmarkFamilies::FindBenchmarks( std::string spec, std::vector* benchmarks, std::ostream* ErrStream) { - CHECK(ErrStream); + BM_CHECK(ErrStream); auto& Err = *ErrStream; // Make regular expression out of command-line flag std::string error_msg; Regex re; - bool isNegativeFilter = false; + bool is_negative_filter = false; if (spec[0] == '-') { spec.replace(0, 1, ""); - isNegativeFilter = true; + is_negative_filter = true; } if (!re.Init(spec, &error_msg)) { Err << "Could not compile benchmark re: " << error_msg << std::endl; @@ -129,8 +132,13 @@ bool BenchmarkFamilies::FindBenchmarks( // Special list of thread counts to use when none are specified const std::vector one_thread = {1}; + int next_family_index = 0; + MutexLock l(mutex_); for (std::unique_ptr& family : families_) { + int family_index = next_family_index; + int per_family_instance_index = 0; + // Family was deleted or benchmark doesn't match if (!family) continue; @@ -149,85 +157,27 @@ bool BenchmarkFamilies::FindBenchmarks( << " will be repeated at least " << family_size << " times.\n"; } // reserve in the special case the regex ".", since we know the final - // family size. - if (spec == ".") benchmarks->reserve(family_size); + // family size. this doesn't take into account any disabled benchmarks + // so worst case we reserve more than we need. 
+ if (spec == ".") benchmarks->reserve(benchmarks->size() + family_size); for (auto const& args : family->args_) { for (int num_threads : *thread_counts) { - BenchmarkInstance instance; - instance.name.function_name = family->name_; - instance.benchmark = family.get(); - instance.aggregation_report_mode = family->aggregation_report_mode_; - instance.arg = args; - instance.time_unit = family->time_unit_; - instance.range_multiplier = family->range_multiplier_; - instance.min_time = family->min_time_; - instance.iterations = family->iterations_; - instance.repetitions = family->repetitions_; - instance.measure_process_cpu_time = family->measure_process_cpu_time_; - instance.use_real_time = family->use_real_time_; - instance.use_manual_time = family->use_manual_time_; - instance.complexity = family->complexity_; - instance.complexity_lambda = family->complexity_lambda_; - instance.statistics = &family->statistics_; - instance.threads = num_threads; - - // Add arguments to instance name - size_t arg_i = 0; - for (auto const& arg : args) { - if (!instance.name.args.empty()) { - instance.name.args += '/'; - } - - if (arg_i < family->arg_names_.size()) { - const auto& arg_name = family->arg_names_[arg_i]; - if (!arg_name.empty()) { - instance.name.args += StrFormat("%s:", arg_name.c_str()); - } - } - - instance.name.args += StrFormat("%" PRId64, arg); - ++arg_i; - } - - if (!IsZero(family->min_time_)) - instance.name.min_time = - StrFormat("min_time:%0.3f", family->min_time_); - if (family->iterations_ != 0) { - instance.name.iterations = - StrFormat("iterations:%lu", - static_cast(family->iterations_)); - } - if (family->repetitions_ != 0) - instance.name.repetitions = - StrFormat("repeats:%d", family->repetitions_); - - if (family->measure_process_cpu_time_) { - instance.name.time_type = "process_time"; - } - - if (family->use_manual_time_) { - if (!instance.name.time_type.empty()) { - instance.name.time_type += '/'; - } - instance.name.time_type += "manual_time"; - } 
else if (family->use_real_time_) { - if (!instance.name.time_type.empty()) { - instance.name.time_type += '/'; - } - instance.name.time_type += "real_time"; - } + BenchmarkInstance instance(family.get(), family_index, + per_family_instance_index, args, + num_threads); + + const auto full_name = instance.name().str(); + if (full_name.rfind(kDisabledPrefix, 0) != 0 && + ((re.Match(full_name) && !is_negative_filter) || + (!re.Match(full_name) && is_negative_filter))) { + benchmarks->push_back(std::move(instance)); - // Add the number of threads used to the name - if (!family->thread_counts_.empty()) { - instance.name.threads = StrFormat("threads:%d", instance.threads); - } + ++per_family_instance_index; - const auto full_name = instance.name.str(); - if ((re.Match(full_name) && !isNegativeFilter) || - (!re.Match(full_name) && isNegativeFilter)) { - instance.last_benchmark_instance = (&args == &family->args_.back()); - benchmarks->push_back(std::move(instance)); + // Only bump the next family index once we've established that + // at least one instance of this family will be run. 
+ if (next_family_index == family_index) ++next_family_index; } } } @@ -254,39 +204,50 @@ bool FindBenchmarksInternal(const std::string& re, // Benchmark //=============================================================================// -Benchmark::Benchmark(const char* name) +Benchmark::Benchmark(const std::string& name) : name_(name), aggregation_report_mode_(ARM_Unspecified), - time_unit_(kNanosecond), + time_unit_(GetDefaultTimeUnit()), + use_default_time_unit_(true), range_multiplier_(kRangeMultiplier), min_time_(0), + min_warmup_time_(0), iterations_(0), repetitions_(0), measure_process_cpu_time_(false), use_real_time_(false), use_manual_time_(false), complexity_(oNone), - complexity_lambda_(nullptr) { + complexity_lambda_(nullptr), + setup_(nullptr), + teardown_(nullptr) { ComputeStatistics("mean", StatisticsMean); ComputeStatistics("median", StatisticsMedian); ComputeStatistics("stddev", StatisticsStdDev); + ComputeStatistics("cv", StatisticsCV, kPercentage); } Benchmark::~Benchmark() {} +Benchmark* Benchmark::Name(const std::string& name) { + SetName(name); + return this; +} + Benchmark* Benchmark::Arg(int64_t x) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); + BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); args_.push_back({x}); return this; } Benchmark* Benchmark::Unit(TimeUnit unit) { time_unit_ = unit; + use_default_time_unit_ = false; return this; } Benchmark* Benchmark::Range(int64_t start, int64_t limit) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); + BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); std::vector arglist; AddRange(&arglist, start, limit, range_multiplier_); @@ -298,53 +259,61 @@ Benchmark* Benchmark::Range(int64_t start, int64_t limit) { Benchmark* Benchmark::Ranges( const std::vector>& ranges) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(ranges.size())); + BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(ranges.size())); std::vector> arglists(ranges.size()); - std::size_t total = 1; for (std::size_t i = 0; i < ranges.size(); i++) 
{ AddRange(&arglists[i], ranges[i].first, ranges[i].second, range_multiplier_); - total *= arglists[i].size(); } - std::vector ctr(arglists.size(), 0); - - for (std::size_t i = 0; i < total; i++) { - std::vector tmp; - tmp.reserve(arglists.size()); + ArgsProduct(arglists); - for (std::size_t j = 0; j < arglists.size(); j++) { - tmp.push_back(arglists[j].at(ctr[j])); - } + return this; +} - args_.push_back(std::move(tmp)); +Benchmark* Benchmark::ArgsProduct( + const std::vector>& arglists) { + BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(arglists.size())); - for (std::size_t j = 0; j < arglists.size(); j++) { - if (ctr[j] + 1 < arglists[j].size()) { - ++ctr[j]; - break; - } - ctr[j] = 0; + std::vector indices(arglists.size()); + const std::size_t total = std::accumulate( + std::begin(arglists), std::end(arglists), std::size_t{1}, + [](const std::size_t res, const std::vector& arglist) { + return res * arglist.size(); + }); + std::vector args; + args.reserve(arglists.size()); + for (std::size_t i = 0; i < total; i++) { + for (std::size_t arg = 0; arg < arglists.size(); arg++) { + args.push_back(arglists[arg][indices[arg]]); } + args_.push_back(args); + args.clear(); + + std::size_t arg = 0; + do { + indices[arg] = (indices[arg] + 1) % arglists[arg].size(); + } while (indices[arg++] == 0 && arg < arglists.size()); } + return this; } Benchmark* Benchmark::ArgName(const std::string& name) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); + BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); arg_names_ = {name}; return this; } Benchmark* Benchmark::ArgNames(const std::vector& names) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(names.size())); + BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(names.size())); arg_names_ = names; return this; } Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); - CHECK_LE(start, limit); + BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); + BM_CHECK_LE(start, limit); 
for (int64_t arg = start; arg <= limit; arg += step) { args_.push_back({arg}); } @@ -352,7 +321,7 @@ Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) { } Benchmark* Benchmark::Args(const std::vector& args) { - CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(args.size())); + BM_CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(args.size())); args_.push_back(args); return this; } @@ -362,28 +331,48 @@ Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) { return this; } +Benchmark* Benchmark::Setup(void (*setup)(const benchmark::State&)) { + BM_CHECK(setup != nullptr); + setup_ = setup; + return this; +} + +Benchmark* Benchmark::Teardown(void (*teardown)(const benchmark::State&)) { + BM_CHECK(teardown != nullptr); + teardown_ = teardown; + return this; +} + Benchmark* Benchmark::RangeMultiplier(int multiplier) { - CHECK(multiplier > 1); + BM_CHECK(multiplier > 1); range_multiplier_ = multiplier; return this; } Benchmark* Benchmark::MinTime(double t) { - CHECK(t > 0.0); - CHECK(iterations_ == 0); + BM_CHECK(t > 0.0); + BM_CHECK(iterations_ == 0); min_time_ = t; return this; } +Benchmark* Benchmark::MinWarmUpTime(double t) { + BM_CHECK(t >= 0.0); + BM_CHECK(iterations_ == 0); + min_warmup_time_ = t; + return this; +} + Benchmark* Benchmark::Iterations(IterationCount n) { - CHECK(n > 0); - CHECK(IsZero(min_time_)); + BM_CHECK(n > 0); + BM_CHECK(IsZero(min_time_)); + BM_CHECK(IsZero(min_warmup_time_)); iterations_ = n; return this; } Benchmark* Benchmark::Repetitions(int n) { - CHECK(n > 0); + BM_CHECK(n > 0); repetitions_ = n; return this; } @@ -416,14 +405,14 @@ Benchmark* Benchmark::MeasureProcessCPUTime() { } Benchmark* Benchmark::UseRealTime() { - CHECK(!use_manual_time_) + BM_CHECK(!use_manual_time_) << "Cannot set UseRealTime and UseManualTime simultaneously."; use_real_time_ = true; return this; } Benchmark* Benchmark::UseManualTime() { - CHECK(!use_real_time_) + BM_CHECK(!use_real_time_) << "Cannot set UseRealTime 
and UseManualTime simultaneously."; use_manual_time_ = true; return this; @@ -440,21 +429,22 @@ Benchmark* Benchmark::Complexity(BigOFunc* complexity) { return this; } -Benchmark* Benchmark::ComputeStatistics(std::string name, - StatisticsFunc* statistics) { - statistics_.emplace_back(name, statistics); +Benchmark* Benchmark::ComputeStatistics(const std::string& name, + StatisticsFunc* statistics, + StatisticUnit unit) { + statistics_.emplace_back(name, statistics, unit); return this; } Benchmark* Benchmark::Threads(int t) { - CHECK_GT(t, 0); + BM_CHECK_GT(t, 0); thread_counts_.push_back(t); return this; } Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) { - CHECK_GT(min_threads, 0); - CHECK_GE(max_threads, min_threads); + BM_CHECK_GT(min_threads, 0); + BM_CHECK_GE(max_threads, min_threads); AddRange(&thread_counts_, min_threads, max_threads, 2); return this; @@ -462,9 +452,9 @@ Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) { Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads, int stride) { - CHECK_GT(min_threads, 0); - CHECK_GE(max_threads, min_threads); - CHECK_GE(stride, 1); + BM_CHECK_GT(min_threads, 0); + BM_CHECK_GE(max_threads, min_threads); + BM_CHECK_GE(stride, 1); for (auto i = min_threads; i < max_threads; i += stride) { thread_counts_.push_back(i); @@ -478,7 +468,9 @@ Benchmark* Benchmark::ThreadPerCpu() { return this; } -void Benchmark::SetName(const char* name) { name_ = name; } +void Benchmark::SetName(const std::string& name) { name_ = name; } + +const char* Benchmark::GetName() const { return name_.c_str(); } int Benchmark::ArgsCnt() const { if (args_.empty()) { @@ -488,6 +480,16 @@ int Benchmark::ArgsCnt() const { return static_cast(args_.front().size()); } +const char* Benchmark::GetArgName(int arg) const { + BM_CHECK_GE(arg, 0); + BM_CHECK_LT(arg, static_cast(arg_names_.size())); + return arg_names_[arg].c_str(); +} + +TimeUnit Benchmark::GetTimeUnit() const { + return 
use_default_time_unit_ ? GetDefaultTimeUnit() : time_unit_; +} + //=============================================================================// // FunctionBenchmark //=============================================================================// @@ -500,4 +502,19 @@ void ClearRegisteredBenchmarks() { internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks(); } +std::vector CreateRange(int64_t lo, int64_t hi, int multi) { + std::vector args; + internal::AddRange(&args, lo, hi, multi); + return args; +} + +std::vector CreateDenseRange(int64_t start, int64_t limit, int step) { + BM_CHECK_LE(start, limit); + std::vector args; + for (int64_t arg = start; arg <= limit; arg += step) { + args.push_back(arg); + } + return args; +} + } // end namespace benchmark diff --git a/ThirdParty/googlebenchmark/src/benchmark_register.h b/ThirdParty/googlebenchmark/src/benchmark_register.h index 61377d7423..53367c707c 100644 --- a/ThirdParty/googlebenchmark/src/benchmark_register.h +++ b/ThirdParty/googlebenchmark/src/benchmark_register.h @@ -1,6 +1,8 @@ #ifndef BENCHMARK_REGISTER_H #define BENCHMARK_REGISTER_H +#include +#include #include #include "check.h" @@ -11,18 +13,18 @@ namespace internal { // Append the powers of 'mult' in the closed interval [lo, hi]. // Returns iterator to the start of the inserted range. 
template -typename std::vector::iterator -AddPowers(std::vector* dst, T lo, T hi, int mult) { - CHECK_GE(lo, 0); - CHECK_GE(hi, lo); - CHECK_GE(mult, 2); +typename std::vector::iterator AddPowers(std::vector* dst, T lo, T hi, + int mult) { + BM_CHECK_GE(lo, 0); + BM_CHECK_GE(hi, lo); + BM_CHECK_GE(mult, 2); const size_t start_offset = dst->size(); static const T kmax = std::numeric_limits::max(); // Space out the values in multiples of "mult" - for (T i = 1; i <= hi; i *= mult) { + for (T i = static_cast(1); i <= hi; i *= static_cast(mult)) { if (i >= lo) { dst->push_back(i); } @@ -31,16 +33,16 @@ AddPowers(std::vector* dst, T lo, T hi, int mult) { if (i > kmax / mult) break; } - return dst->begin() + start_offset; + return dst->begin() + static_cast(start_offset); } template void AddNegatedPowers(std::vector* dst, T lo, T hi, int mult) { // We negate lo and hi so we require that they cannot be equal to 'min'. - CHECK_GT(lo, std::numeric_limits::min()); - CHECK_GT(hi, std::numeric_limits::min()); - CHECK_GE(hi, lo); - CHECK_LE(hi, 0); + BM_CHECK_GT(lo, std::numeric_limits::min()); + BM_CHECK_GT(hi, std::numeric_limits::min()); + BM_CHECK_GE(hi, lo); + BM_CHECK_LE(hi, 0); // Add positive powers, then negate and reverse. // Casts necessary since small integers get promoted @@ -59,8 +61,8 @@ void AddRange(std::vector* dst, T lo, T hi, int mult) { static_assert(std::is_integral::value && std::is_signed::value, "Args type must be a signed integer"); - CHECK_GE(hi, lo); - CHECK_GE(mult, 2); + BM_CHECK_GE(hi, lo); + BM_CHECK_GE(mult, 2); // Add "lo" dst->push_back(lo); @@ -86,7 +88,7 @@ void AddRange(std::vector* dst, T lo, T hi, int mult) { } // Treat 0 as a special case (see discussion on #762). 
- if (lo <= 0 && hi >= 0) { + if (lo < 0 && hi >= 0) { dst->push_back(0); } diff --git a/ThirdParty/googlebenchmark/src/benchmark_runner.cc b/ThirdParty/googlebenchmark/src/benchmark_runner.cc index 0bae6a545e..f7ae424397 100644 --- a/ThirdParty/googlebenchmark/src/benchmark_runner.cc +++ b/ThirdParty/googlebenchmark/src/benchmark_runner.cc @@ -13,12 +13,13 @@ // limitations under the License. #include "benchmark_runner.h" + #include "benchmark/benchmark.h" #include "benchmark_api_internal.h" #include "internal_macros.h" #ifndef BENCHMARK_OS_WINDOWS -#ifndef BENCHMARK_OS_FUCHSIA +#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT) #include #endif #include @@ -27,11 +28,14 @@ #include #include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -45,6 +49,7 @@ #include "internal_macros.h" #include "log.h" #include "mutex.h" +#include "perf_counters.h" #include "re.h" #include "statistics.h" #include "string_util.h" @@ -60,64 +65,72 @@ MemoryManager* memory_manager = nullptr; namespace { static constexpr IterationCount kMaxIterations = 1000000000; +const double kDefaultMinTime = + std::strtod(::benchmark::kDefaultMinTimeStr, /*p_end*/ nullptr); BenchmarkReporter::Run CreateRunReport( const benchmark::internal::BenchmarkInstance& b, const internal::ThreadManager::Result& results, IterationCount memory_iterations, - const MemoryManager::Result& memory_result, double seconds, - int64_t repetition_index) { + const MemoryManager::Result* memory_result, double seconds, + int64_t repetition_index, int64_t repeats) { // Create report about this benchmark run. 
BenchmarkReporter::Run report; - report.run_name = b.name; - report.error_occurred = results.has_error_; - report.error_message = results.error_message_; + report.run_name = b.name(); + report.family_index = b.family_index(); + report.per_family_instance_index = b.per_family_instance_index(); + report.skipped = results.skipped_; + report.skip_message = results.skip_message_; report.report_label = results.report_label_; // This is the total iterations across all threads. report.iterations = results.iterations; - report.time_unit = b.time_unit; - report.threads = b.threads; + report.time_unit = b.time_unit(); + report.threads = b.threads(); report.repetition_index = repetition_index; - report.repetitions = b.repetitions; + report.repetitions = repeats; - if (!report.error_occurred) { - if (b.use_manual_time) { + if (!report.skipped) { + if (b.use_manual_time()) { report.real_accumulated_time = results.manual_time_used; } else { report.real_accumulated_time = results.real_time_used; } report.cpu_accumulated_time = results.cpu_time_used; report.complexity_n = results.complexity_n; - report.complexity = b.complexity; - report.complexity_lambda = b.complexity_lambda; - report.statistics = b.statistics; + report.complexity = b.complexity(); + report.complexity_lambda = b.complexity_lambda(); + report.statistics = &b.statistics(); report.counters = results.counters; if (memory_iterations > 0) { - report.has_memory_result = true; + assert(memory_result != nullptr); + report.memory_result = memory_result; report.allocs_per_iter = - memory_iterations ? static_cast(memory_result.num_allocs) / + memory_iterations ? 
static_cast(memory_result->num_allocs) / memory_iterations : 0; - report.max_bytes_used = memory_result.max_bytes_used; } - internal::Finish(&report.counters, results.iterations, seconds, b.threads); + internal::Finish(&report.counters, results.iterations, seconds, + b.threads()); } return report; } // Execute one thread of benchmark b for the specified number of iterations. -// Adds the stats collected for the thread into *total. +// Adds the stats collected for the thread into manager->results. void RunInThread(const BenchmarkInstance* b, IterationCount iters, - int thread_id, ThreadManager* manager) { + int thread_id, ThreadManager* manager, + PerfCountersMeasurement* perf_counters_measurement) { internal::ThreadTimer timer( - b->measure_process_cpu_time + b->measure_process_cpu_time() ? internal::ThreadTimer::CreateProcessCpuTime() : internal::ThreadTimer::Create()); - State st = b->Run(iters, thread_id, &timer, manager); - CHECK(st.iterations() >= st.max_iterations) + + State st = + b->Run(iters, thread_id, &timer, manager, perf_counters_measurement); + BM_CHECK(st.skipped() || st.iterations() >= st.max_iterations) << "Benchmark returned before State::KeepRunning() returned false!"; { MutexLock l(manager->GetBenchmarkMutex()); @@ -132,228 +145,351 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters, manager->NotifyThreadComplete(); } -class BenchmarkRunner { - public: - BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_, - std::vector* complexity_reports_) - : b(b_), - complexity_reports(*complexity_reports_), - min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time), - repeats(b.repetitions != 0 ? b.repetitions +double ComputeMinTime(const benchmark::internal::BenchmarkInstance& b, + const BenchTimeType& iters_or_time) { + if (!IsZero(b.min_time())) return b.min_time(); + // If the flag was used to specify number of iters, then return the default + // min_time. 
+ if (iters_or_time.tag == BenchTimeType::ITERS) return kDefaultMinTime; + + return iters_or_time.time; +} + +IterationCount ComputeIters(const benchmark::internal::BenchmarkInstance& b, + const BenchTimeType& iters_or_time) { + if (b.iterations() != 0) return b.iterations(); + + // We've already concluded that this flag is currently used to pass + // iters but do a check here again anyway. + BM_CHECK(iters_or_time.tag == BenchTimeType::ITERS); + return iters_or_time.iters; +} + +} // end namespace + +BenchTimeType ParseBenchMinTime(const std::string& value) { + BenchTimeType ret; + + if (value.empty()) { + ret.tag = BenchTimeType::TIME; + ret.time = 0.0; + return ret; + } + + if (value.back() == 'x') { + char* p_end; + // Reset errno before it's changed by strtol. + errno = 0; + IterationCount num_iters = std::strtol(value.c_str(), &p_end, 10); + + // After a valid parse, p_end should have been set to + // point to the 'x' suffix. + BM_CHECK(errno == 0 && p_end != nullptr && *p_end == 'x') + << "Malformed iters value passed to --benchmark_min_time: `" << value + << "`. Expected --benchmark_min_time=x."; + + ret.tag = BenchTimeType::ITERS; + ret.iters = num_iters; + return ret; + } + + bool has_suffix = value.back() == 's'; + if (!has_suffix) { + BM_VLOG(0) << "Value passed to --benchmark_min_time should have a suffix. " + "Eg., `30s` for 30-seconds."; + } + + char* p_end; + // Reset errno before it's changed by strtod. + errno = 0; + double min_time = std::strtod(value.c_str(), &p_end); + + // After a successful parse, p_end should point to the suffix 's', + // or the end of the string if the suffix was omitted. + BM_CHECK(errno == 0 && p_end != nullptr && + ((has_suffix && *p_end == 's') || *p_end == '\0')) + << "Malformed seconds value passed to --benchmark_min_time: `" << value + << "`. 
Expected --benchmark_min_time=x."; + + ret.tag = BenchTimeType::TIME; + ret.time = min_time; + + return ret; +} + +BenchmarkRunner::BenchmarkRunner( + const benchmark::internal::BenchmarkInstance& b_, + PerfCountersMeasurement* pcm_, + BenchmarkReporter::PerFamilyRunReports* reports_for_family_) + : b(b_), + reports_for_family(reports_for_family_), + parsed_benchtime_flag(ParseBenchMinTime(FLAGS_benchmark_min_time)), + min_time(ComputeMinTime(b_, parsed_benchtime_flag)), + min_warmup_time((!IsZero(b.min_time()) && b.min_warmup_time() > 0.0) + ? b.min_warmup_time() + : FLAGS_benchmark_min_warmup_time), + warmup_done(!(min_warmup_time > 0.0)), + repeats(b.repetitions() != 0 ? b.repetitions() : FLAGS_benchmark_repetitions), - has_explicit_iteration_count(b.iterations != 0), - pool(b.threads - 1), - iters(has_explicit_iteration_count ? b.iterations : 1) { + has_explicit_iteration_count(b.iterations() != 0 || + parsed_benchtime_flag.tag == + BenchTimeType::ITERS), + pool(b.threads() - 1), + iters(has_explicit_iteration_count + ? 
ComputeIters(b_, parsed_benchtime_flag) + : 1), + perf_counters_measurement_ptr(pcm_) { + run_results.display_report_aggregates_only = + (FLAGS_benchmark_report_aggregates_only || + FLAGS_benchmark_display_aggregates_only); + run_results.file_report_aggregates_only = + FLAGS_benchmark_report_aggregates_only; + if (b.aggregation_report_mode() != internal::ARM_Unspecified) { run_results.display_report_aggregates_only = - (FLAGS_benchmark_report_aggregates_only || - FLAGS_benchmark_display_aggregates_only); + (b.aggregation_report_mode() & + internal::ARM_DisplayReportAggregatesOnly); run_results.file_report_aggregates_only = - FLAGS_benchmark_report_aggregates_only; - if (b.aggregation_report_mode != internal::ARM_Unspecified) { - run_results.display_report_aggregates_only = - (b.aggregation_report_mode & - internal::ARM_DisplayReportAggregatesOnly); - run_results.file_report_aggregates_only = - (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly); - } + (b.aggregation_report_mode() & internal::ARM_FileReportAggregatesOnly); + BM_CHECK(FLAGS_benchmark_perf_counters.empty() || + (perf_counters_measurement_ptr->num_counters() == 0)) + << "Perf counters were requested but could not be set up."; + } +} - for (int repetition_num = 0; repetition_num < repeats; repetition_num++) { - DoOneRepetition(repetition_num); - } +BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() { + BM_VLOG(2) << "Running " << b.name().str() << " for " << iters << "\n"; - // Calculate additional statistics - run_results.aggregates_only = ComputeStats(run_results.non_aggregates); + std::unique_ptr manager; + manager.reset(new internal::ThreadManager(b.threads())); - // Maybe calculate complexity report - if ((b.complexity != oNone) && b.last_benchmark_instance) { - auto additional_run_stats = ComputeBigO(complexity_reports); - run_results.aggregates_only.insert(run_results.aggregates_only.end(), - additional_run_stats.begin(), - additional_run_stats.end()); - 
complexity_reports.clear(); - } + // Run all but one thread in separate threads + for (std::size_t ti = 0; ti < pool.size(); ++ti) { + pool[ti] = std::thread(&RunInThread, &b, iters, static_cast(ti + 1), + manager.get(), perf_counters_measurement_ptr); } + // And run one thread here directly. + // (If we were asked to run just one thread, we don't create new threads.) + // Yes, we need to do this here *after* we start the separate threads. + RunInThread(&b, iters, 0, manager.get(), perf_counters_measurement_ptr); - RunResults&& get_results() { return std::move(run_results); } + // The main thread has finished. Now let's wait for the other threads. + manager->WaitForAllThreads(); + for (std::thread& thread : pool) thread.join(); - private: - RunResults run_results; + IterationResults i; + // Acquire the measurements/counters from the manager, UNDER THE LOCK! + { + MutexLock l(manager->GetBenchmarkMutex()); + i.results = manager->results; + } - const benchmark::internal::BenchmarkInstance& b; - std::vector& complexity_reports; + // And get rid of the manager. + manager.reset(); - const double min_time; - const int repeats; - const bool has_explicit_iteration_count; + // Adjust real/manual time stats since they were reported per thread. + i.results.real_time_used /= b.threads(); + i.results.manual_time_used /= b.threads(); + // If we were measuring whole-process CPU usage, adjust the CPU time too. + if (b.measure_process_cpu_time()) i.results.cpu_time_used /= b.threads(); - std::vector pool; + BM_VLOG(2) << "Ran in " << i.results.cpu_time_used << "/" + << i.results.real_time_used << "\n"; - IterationCount iters; // preserved between repetitions! - // So only the first repetition has to find/calculate it, - // the other repetitions will just use that precomputed iteration count. + // By using KeepRunningBatch a benchmark can iterate more times than + // requested, so take the iteration count from i.results. 
+ i.iters = i.results.iterations / b.threads(); - struct IterationResults { - internal::ThreadManager::Result results; - IterationCount iters; - double seconds; - }; - IterationResults DoNIterations() { - VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n"; + // Base decisions off of real time if requested by this benchmark. + i.seconds = i.results.cpu_time_used; + if (b.use_manual_time()) { + i.seconds = i.results.manual_time_used; + } else if (b.use_real_time()) { + i.seconds = i.results.real_time_used; + } - std::unique_ptr manager; - manager.reset(new internal::ThreadManager(b.threads)); + return i; +} - // Run all but one thread in separate threads - for (std::size_t ti = 0; ti < pool.size(); ++ti) { - pool[ti] = std::thread(&RunInThread, &b, iters, static_cast(ti + 1), - manager.get()); - } - // And run one thread here directly. - // (If we were asked to run just one thread, we don't create new threads.) - // Yes, we need to do this here *after* we start the separate threads. - RunInThread(&b, iters, 0, manager.get()); +IterationCount BenchmarkRunner::PredictNumItersNeeded( + const IterationResults& i) const { + // See how much iterations should be increased by. + // Note: Avoid division by zero with max(seconds, 1ns). + double multiplier = GetMinTimeToApply() * 1.4 / std::max(i.seconds, 1e-9); + // If our last run was at least 10% of FLAGS_benchmark_min_time then we + // use the multiplier directly. + // Otherwise we use at most 10 times expansion. + // NOTE: When the last run was at least 10% of the min time the max + // expansion should be 14x. + const bool is_significant = (i.seconds / GetMinTimeToApply()) > 0.1; + multiplier = is_significant ? multiplier : 10.0; + + // So what seems to be the sufficiently-large iteration count? Round up. + const IterationCount max_next_iters = static_cast( + std::lround(std::max(multiplier * static_cast(i.iters), + static_cast(i.iters) + 1.0))); + // But we do have *some* limits though.. 
+ const IterationCount next_iters = std::min(max_next_iters, kMaxIterations); + + BM_VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n"; + return next_iters; // round up before conversion to integer. +} - // The main thread has finished. Now let's wait for the other threads. - manager->WaitForAllThreads(); - for (std::thread& thread : pool) thread.join(); +bool BenchmarkRunner::ShouldReportIterationResults( + const IterationResults& i) const { + // Determine if this run should be reported; + // Either it has run for a sufficient amount of time + // or because an error was reported. + return i.results.skipped_ || + i.iters >= kMaxIterations || // Too many iterations already. + i.seconds >= + GetMinTimeToApply() || // The elapsed time is large enough. + // CPU time is specified but the elapsed real time greatly exceeds + // the minimum time. + // Note that user provided timers are except from this test. + ((i.results.real_time_used >= 5 * GetMinTimeToApply()) && + !b.use_manual_time()); +} - IterationResults i; - // Acquire the measurements/counters from the manager, UNDER THE LOCK! - { - MutexLock l(manager->GetBenchmarkMutex()); - i.results = manager->results; - } +double BenchmarkRunner::GetMinTimeToApply() const { + // In order to re-use functionality to run and measure benchmarks for running + // a warmup phase of the benchmark, we need a way of telling whether to apply + // min_time or min_warmup_time. This function will figure out if we are in the + // warmup phase and therefore need to apply min_warmup_time or if we already + // in the benchmarking phase and min_time needs to be applied. + return warmup_done ? min_time : min_warmup_time; +} - // And get rid of the manager. - manager.reset(); +void BenchmarkRunner::FinishWarmUp(const IterationCount& i) { + warmup_done = true; + iters = i; +} - // Adjust real/manual time stats since they were reported per thread. 
- i.results.real_time_used /= b.threads; - i.results.manual_time_used /= b.threads; - // If we were measuring whole-process CPU usage, adjust the CPU time too. - if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads; - - VLOG(2) << "Ran in " << i.results.cpu_time_used << "/" - << i.results.real_time_used << "\n"; - - // So for how long were we running? - i.iters = iters; - // Base decisions off of real time if requested by this benchmark. - i.seconds = i.results.cpu_time_used; - if (b.use_manual_time) { - i.seconds = i.results.manual_time_used; - } else if (b.use_real_time) { - i.seconds = i.results.real_time_used; +void BenchmarkRunner::RunWarmUp() { + // Use the same mechanisms for warming up the benchmark as used for actually + // running and measuring the benchmark. + IterationResults i_warmup; + // Dont use the iterations determined in the warmup phase for the actual + // measured benchmark phase. While this may be a good starting point for the + // benchmark and it would therefore get rid of the need to figure out how many + // iterations are needed if min_time is set again, this may also be a complete + // wrong guess since the warmup loops might be considerably slower (e.g + // because of caching effects). + const IterationCount i_backup = iters; + + for (;;) { + b.Setup(); + i_warmup = DoNIterations(); + b.Teardown(); + + const bool finish = ShouldReportIterationResults(i_warmup); + + if (finish) { + FinishWarmUp(i_backup); + break; } - return i; + // Although we are running "only" a warmup phase where running enough + // iterations at once without measuring time isn't as important as it is for + // the benchmarking phase, we still do it the same way as otherwise it is + // very confusing for the user to know how to choose a proper value for + // min_warmup_time if a different approach on running it is used. 
+ iters = PredictNumItersNeeded(i_warmup); + assert(iters > i_warmup.iters && + "if we did more iterations than we want to do the next time, " + "then we should have accepted the current iteration run."); } +} - IterationCount PredictNumItersNeeded(const IterationResults& i) const { - // See how much iterations should be increased by. - // Note: Avoid division by zero with max(seconds, 1ns). - double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9); - // If our last run was at least 10% of FLAGS_benchmark_min_time then we - // use the multiplier directly. - // Otherwise we use at most 10 times expansion. - // NOTE: When the last run was at least 10% of the min time the max - // expansion should be 14x. - bool is_significant = (i.seconds / min_time) > 0.1; - multiplier = is_significant ? multiplier : std::min(10.0, multiplier); - if (multiplier <= 1.0) multiplier = 2.0; - - // So what seems to be the sufficiently-large iteration count? Round up. - const IterationCount max_next_iters = - 0.5 + std::max(multiplier * i.iters, i.iters + 1.0); - // But we do have *some* sanity limits though.. - const IterationCount next_iters = std::min(max_next_iters, kMaxIterations); - - VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n"; - return next_iters; // round up before conversion to integer. +void BenchmarkRunner::DoOneRepetition() { + assert(HasRepeatsRemaining() && "Already done all repetitions?"); + + const bool is_the_first_repetition = num_repetitions_done == 0; + + // In case a warmup phase is requested by the benchmark, run it now. + // After running the warmup phase the BenchmarkRunner should be in a state as + // this warmup never happened except the fact that warmup_done is set. Every + // other manipulation of the BenchmarkRunner instance would be a bug! Please + // fix it. 
+ if (!warmup_done) RunWarmUp(); + + IterationResults i; + // We *may* be gradually increasing the length (iteration count) + // of the benchmark until we decide the results are significant. + // And once we do, we report those last results and exit. + // Please do note that the if there are repetitions, the iteration count + // is *only* calculated for the *first* repetition, and other repetitions + // simply use that precomputed iteration count. + for (;;) { + b.Setup(); + i = DoNIterations(); + b.Teardown(); + + // Do we consider the results to be significant? + // If we are doing repetitions, and the first repetition was already done, + // it has calculated the correct iteration time, so we have run that very + // iteration count just now. No need to calculate anything. Just report. + // Else, the normal rules apply. + const bool results_are_significant = !is_the_first_repetition || + has_explicit_iteration_count || + ShouldReportIterationResults(i); + + if (results_are_significant) break; // Good, let's report them! + + // Nope, bad iteration. Let's re-estimate the hopefully-sufficient + // iteration count, and run the benchmark again... + + iters = PredictNumItersNeeded(i); + assert(iters > i.iters && + "if we did more iterations than we want to do the next time, " + "then we should have accepted the current iteration run."); } - bool ShouldReportIterationResults(const IterationResults& i) const { - // Determine if this run should be reported; - // Either it has run for a sufficient amount of time - // or because an error was reported. - return i.results.has_error_ || - i.iters >= kMaxIterations || // Too many iterations already. - i.seconds >= min_time || // The elapsed time is large enough. - // CPU time is specified but the elapsed real time greatly exceeds - // the minimum time. - // Note that user provided timers are except from this sanity check. 
- ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time); + // Oh, one last thing, we need to also produce the 'memory measurements'.. + MemoryManager::Result* memory_result = nullptr; + IterationCount memory_iterations = 0; + if (memory_manager != nullptr) { + // TODO(vyng): Consider making BenchmarkReporter::Run::memory_result an + // optional so we don't have to own the Result here. + // Can't do it now due to cxx03. + memory_results.push_back(MemoryManager::Result()); + memory_result = &memory_results.back(); + // Only run a few iterations to reduce the impact of one-time + // allocations in benchmarks that are not properly managed. + memory_iterations = std::min(16, iters); + memory_manager->Start(); + std::unique_ptr manager; + manager.reset(new internal::ThreadManager(1)); + b.Setup(); + RunInThread(&b, memory_iterations, 0, manager.get(), + perf_counters_measurement_ptr); + manager->WaitForAllThreads(); + manager.reset(); + b.Teardown(); + memory_manager->Stop(*memory_result); } - void DoOneRepetition(int64_t repetition_index) { - const bool is_the_first_repetition = repetition_index == 0; - IterationResults i; - - // We *may* be gradually increasing the length (iteration count) - // of the benchmark until we decide the results are significant. - // And once we do, we report those last results and exit. - // Please do note that the if there are repetitions, the iteration count - // is *only* calculated for the *first* repetition, and other repetitions - // simply use that precomputed iteration count. - for (;;) { - i = DoNIterations(); - - // Do we consider the results to be significant? - // If we are doing repetitions, and the first repetition was already done, - // it has calculated the correct iteration time, so we have run that very - // iteration count just now. No need to calculate anything. Just report. - // Else, the normal rules apply. 
- const bool results_are_significant = !is_the_first_repetition || - has_explicit_iteration_count || - ShouldReportIterationResults(i); - - if (results_are_significant) break; // Good, let's report them! - - // Nope, bad iteration. Let's re-estimate the hopefully-sufficient - // iteration count, and run the benchmark again... - - iters = PredictNumItersNeeded(i); - assert(iters > i.iters && - "if we did more iterations than we want to do the next time, " - "then we should have accepted the current iteration run."); - } + // Ok, now actually report. + BenchmarkReporter::Run report = + CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds, + num_repetitions_done, repeats); - // Oh, one last thing, we need to also produce the 'memory measurements'.. - MemoryManager::Result memory_result; - IterationCount memory_iterations = 0; - if (memory_manager != nullptr) { - // Only run a few iterations to reduce the impact of one-time - // allocations in benchmarks that are not properly managed. - memory_iterations = std::min(16, iters); - memory_manager->Start(); - std::unique_ptr manager; - manager.reset(new internal::ThreadManager(1)); - RunInThread(&b, memory_iterations, 0, manager.get()); - manager->WaitForAllThreads(); - manager.reset(); - - memory_manager->Stop(&memory_result); - } + if (reports_for_family) { + ++reports_for_family->num_runs_done; + if (!report.skipped) reports_for_family->Runs.push_back(report); + } - // Ok, now actualy report. 
- BenchmarkReporter::Run report = - CreateRunReport(b, i.results, memory_iterations, memory_result, - i.seconds, repetition_index); + run_results.non_aggregates.push_back(report); - if (!report.error_occurred && b.complexity != oNone) - complexity_reports.push_back(report); + ++num_repetitions_done; +} - run_results.non_aggregates.push_back(report); - } -}; +RunResults&& BenchmarkRunner::GetResults() { + assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?"); -} // end namespace + // Calculate additional statistics over the repetitions of this instance. + run_results.aggregates_only = ComputeStats(run_results.non_aggregates); -RunResults RunBenchmark( - const benchmark::internal::BenchmarkInstance& b, - std::vector* complexity_reports) { - internal::BenchmarkRunner r(b, complexity_reports); - return r.get_results(); + return std::move(run_results); } } // end namespace internal diff --git a/ThirdParty/googlebenchmark/src/benchmark_runner.h b/ThirdParty/googlebenchmark/src/benchmark_runner.h index 96e8282a11..db2fa04396 100644 --- a/ThirdParty/googlebenchmark/src/benchmark_runner.h +++ b/ThirdParty/googlebenchmark/src/benchmark_runner.h @@ -15,19 +15,23 @@ #ifndef BENCHMARK_RUNNER_H_ #define BENCHMARK_RUNNER_H_ +#include +#include + #include "benchmark_api_internal.h" #include "internal_macros.h" - -DECLARE_double(benchmark_min_time); - -DECLARE_int32(benchmark_repetitions); - -DECLARE_bool(benchmark_report_aggregates_only); - -DECLARE_bool(benchmark_display_aggregates_only); +#include "perf_counters.h" +#include "thread_manager.h" namespace benchmark { +BM_DECLARE_string(benchmark_min_time); +BM_DECLARE_double(benchmark_min_warmup_time); +BM_DECLARE_int32(benchmark_repetitions); +BM_DECLARE_bool(benchmark_report_aggregates_only); +BM_DECLARE_bool(benchmark_display_aggregates_only); +BM_DECLARE_string(benchmark_perf_counters); + namespace internal { extern MemoryManager* memory_manager; @@ -40,9 +44,85 @@ struct RunResults { bool 
file_report_aggregates_only = false; }; -RunResults RunBenchmark( - const benchmark::internal::BenchmarkInstance& b, - std::vector* complexity_reports); +struct BENCHMARK_EXPORT BenchTimeType { + enum { ITERS, TIME } tag; + union { + IterationCount iters; + double time; + }; +}; + +BENCHMARK_EXPORT +BenchTimeType ParseBenchMinTime(const std::string& value); + +class BenchmarkRunner { + public: + BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_, + benchmark::internal::PerfCountersMeasurement* pmc_, + BenchmarkReporter::PerFamilyRunReports* reports_for_family); + + int GetNumRepeats() const { return repeats; } + + bool HasRepeatsRemaining() const { + return GetNumRepeats() != num_repetitions_done; + } + + void DoOneRepetition(); + + RunResults&& GetResults(); + + BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const { + return reports_for_family; + } + + double GetMinTime() const { return min_time; } + + bool HasExplicitIters() const { return has_explicit_iteration_count; } + + IterationCount GetIters() const { return iters; } + + private: + RunResults run_results; + + const benchmark::internal::BenchmarkInstance& b; + BenchmarkReporter::PerFamilyRunReports* reports_for_family; + + BenchTimeType parsed_benchtime_flag; + const double min_time; + const double min_warmup_time; + bool warmup_done; + const int repeats; + const bool has_explicit_iteration_count; + + int num_repetitions_done = 0; + + std::vector pool; + + std::vector memory_results; + + IterationCount iters; // preserved between repetitions! + // So only the first repetition has to find/calculate it, + // the other repetitions will just use that precomputed iteration count. 
+ + PerfCountersMeasurement* const perf_counters_measurement_ptr = nullptr; + + struct IterationResults { + internal::ThreadManager::Result results; + IterationCount iters; + double seconds; + }; + IterationResults DoNIterations(); + + IterationCount PredictNumItersNeeded(const IterationResults& i) const; + + bool ShouldReportIterationResults(const IterationResults& i) const; + + double GetMinTimeToApply() const; + + void FinishWarmUp(const IterationCount& i); + + void RunWarmUp(); +}; } // namespace internal diff --git a/ThirdParty/googlebenchmark/src/check.cc b/ThirdParty/googlebenchmark/src/check.cc new file mode 100644 index 0000000000..5f7526e08d --- /dev/null +++ b/ThirdParty/googlebenchmark/src/check.cc @@ -0,0 +1,11 @@ +#include "check.h" + +namespace benchmark { +namespace internal { + +static AbortHandlerT* handler = &std::abort; + +BENCHMARK_EXPORT AbortHandlerT*& GetAbortHandler() { return handler; } + +} // namespace internal +} // namespace benchmark diff --git a/ThirdParty/googlebenchmark/src/check.h b/ThirdParty/googlebenchmark/src/check.h index f5f8253f80..c1cd5e85e4 100644 --- a/ThirdParty/googlebenchmark/src/check.h +++ b/ThirdParty/googlebenchmark/src/check.h @@ -5,26 +5,43 @@ #include #include +#include "benchmark/export.h" #include "internal_macros.h" #include "log.h" +#if defined(__GNUC__) || defined(__clang__) +#define BENCHMARK_NOEXCEPT noexcept +#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#elif defined(_MSC_VER) && !defined(__clang__) +#if _MSC_VER >= 1900 +#define BENCHMARK_NOEXCEPT noexcept +#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x) +#else +#define BENCHMARK_NOEXCEPT +#define BENCHMARK_NOEXCEPT_OP(x) +#endif +#define __func__ __FUNCTION__ +#else +#define BENCHMARK_NOEXCEPT +#define BENCHMARK_NOEXCEPT_OP(x) +#endif + namespace benchmark { namespace internal { typedef void(AbortHandlerT)(); -inline AbortHandlerT*& GetAbortHandler() { - static AbortHandlerT* handler = &std::abort; - return handler; -} +BENCHMARK_EXPORT +AbortHandlerT*& 
GetAbortHandler(); BENCHMARK_NORETURN inline void CallAbortHandler() { GetAbortHandler()(); std::abort(); // fallback to enforce noreturn } -// CheckHandler is the class constructed by failing CHECK macros. CheckHandler -// will log information about the failures and abort when it is destructed. +// CheckHandler is the class constructed by failing BM_CHECK macros. +// CheckHandler will log information about the failures and abort when it is +// destructed. class CheckHandler { public: CheckHandler(const char* check, const char* file, const char* func, int line) @@ -35,10 +52,17 @@ class CheckHandler { LogType& GetLog() { return log_; } +#if defined(COMPILER_MSVC) +#pragma warning(push) +#pragma warning(disable : 4722) +#endif BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) { log_ << std::endl; CallAbortHandler(); } +#if defined(COMPILER_MSVC) +#pragma warning(pop) +#endif CheckHandler& operator=(const CheckHandler&) = delete; CheckHandler(const CheckHandler&) = delete; @@ -51,32 +75,32 @@ class CheckHandler { } // end namespace internal } // end namespace benchmark -// The CHECK macro returns a std::ostream object that can have extra information -// written to it. +// The BM_CHECK macro returns a std::ostream object that can have extra +// information written to it. #ifndef NDEBUG -#define CHECK(b) \ +#define BM_CHECK(b) \ (b ? 
::benchmark::internal::GetNullLogInstance() \ : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \ .GetLog()) #else -#define CHECK(b) ::benchmark::internal::GetNullLogInstance() +#define BM_CHECK(b) ::benchmark::internal::GetNullLogInstance() #endif // clang-format off // preserve whitespacing between operators for alignment -#define CHECK_EQ(a, b) CHECK((a) == (b)) -#define CHECK_NE(a, b) CHECK((a) != (b)) -#define CHECK_GE(a, b) CHECK((a) >= (b)) -#define CHECK_LE(a, b) CHECK((a) <= (b)) -#define CHECK_GT(a, b) CHECK((a) > (b)) -#define CHECK_LT(a, b) CHECK((a) < (b)) - -#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) < (eps)) -#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps)) -#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps)) -#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps)) -#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) > (eps)) -#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) > (eps)) +#define BM_CHECK_EQ(a, b) BM_CHECK((a) == (b)) +#define BM_CHECK_NE(a, b) BM_CHECK((a) != (b)) +#define BM_CHECK_GE(a, b) BM_CHECK((a) >= (b)) +#define BM_CHECK_LE(a, b) BM_CHECK((a) <= (b)) +#define BM_CHECK_GT(a, b) BM_CHECK((a) > (b)) +#define BM_CHECK_LT(a, b) BM_CHECK((a) < (b)) + +#define BM_CHECK_FLOAT_EQ(a, b, eps) BM_CHECK(std::fabs((a) - (b)) < (eps)) +#define BM_CHECK_FLOAT_NE(a, b, eps) BM_CHECK(std::fabs((a) - (b)) >= (eps)) +#define BM_CHECK_FLOAT_GE(a, b, eps) BM_CHECK((a) - (b) > -(eps)) +#define BM_CHECK_FLOAT_LE(a, b, eps) BM_CHECK((b) - (a) > -(eps)) +#define BM_CHECK_FLOAT_GT(a, b, eps) BM_CHECK((a) - (b) > (eps)) +#define BM_CHECK_FLOAT_LT(a, b, eps) BM_CHECK((b) - (a) > (eps)) //clang-format on #endif // CHECK_H_ diff --git a/ThirdParty/googlebenchmark/src/colorprint.cc b/ThirdParty/googlebenchmark/src/colorprint.cc index fff6a98818..0bfd67041d 100644 --- a/ThirdParty/googlebenchmark/src/colorprint.cc +++ b/ThirdParty/googlebenchmark/src/colorprint.cc @@ -25,8 +25,8 @@ 
#include "internal_macros.h" #ifdef BENCHMARK_OS_WINDOWS -#include #include +#include #else #include #endif // BENCHMARK_OS_WINDOWS @@ -94,20 +94,20 @@ std::string FormatString(const char* msg, va_list args) { va_end(args_cp); // currently there is no error handling for failure, so this is hack. - CHECK(ret >= 0); + BM_CHECK(ret >= 0); - if (ret == 0) // handle empty expansion + if (ret == 0) { // handle empty expansion return {}; - else if (static_cast(ret) < size) + } + if (static_cast(ret) < size) { return local_buff; - else { - // we did not provide a long enough buffer on our first attempt. - size = (size_t)ret + 1; // + 1 for the null byte - std::unique_ptr buff(new char[size]); - ret = vsnprintf(buff.get(), size, msg, args); - CHECK(ret > 0 && ((size_t)ret) < size); - return buff.get(); } + // we did not provide a long enough buffer on our first attempt. + size = static_cast(ret) + 1; // + 1 for the null byte + std::unique_ptr buff(new char[size]); + ret = vsnprintf(buff.get(), size, msg, args); + BM_CHECK(ret > 0 && (static_cast(ret)) < size); + return buff.get(); } std::string FormatString(const char* msg, ...) { @@ -163,12 +163,24 @@ bool IsColorTerminal() { #else // On non-Windows platforms, we rely on the TERM variable. This list of // supported TERM values is copied from Google Test: - // . + // . 
const char* const SUPPORTED_TERM_VALUES[] = { - "xterm", "xterm-color", "xterm-256color", - "screen", "screen-256color", "tmux", - "tmux-256color", "rxvt-unicode", "rxvt-unicode-256color", - "linux", "cygwin", + "xterm", + "xterm-color", + "xterm-256color", + "screen", + "screen-256color", + "tmux", + "tmux-256color", + "rxvt-unicode", + "rxvt-unicode-256color", + "linux", + "cygwin", + "xterm-kitty", + "alacritty", + "foot", + "foot-extra", + "wezterm", }; const char* const term = getenv("TERM"); diff --git a/ThirdParty/googlebenchmark/src/commandlineflags.cc b/ThirdParty/googlebenchmark/src/commandlineflags.cc index 6bd65c5ae7..dcb414959d 100644 --- a/ThirdParty/googlebenchmark/src/commandlineflags.cc +++ b/ThirdParty/googlebenchmark/src/commandlineflags.cc @@ -14,11 +14,16 @@ #include "commandlineflags.h" +#include #include #include #include #include #include +#include +#include + +#include "../src/string_util.h" namespace benchmark { namespace { @@ -77,6 +82,30 @@ bool ParseDouble(const std::string& src_text, const char* str, double* value) { return true; } +// Parses 'str' into KV pairs. If successful, writes the result to *value and +// returns true; otherwise leaves *value unchanged and returns false. +bool ParseKvPairs(const std::string& src_text, const char* str, + std::map* value) { + std::map kvs; + for (const auto& kvpair : StrSplit(str, ',')) { + const auto kv = StrSplit(kvpair, '='); + if (kv.size() != 2) { + std::cerr << src_text << " is expected to be a comma-separated list of " + << "= strings, but actually has value \"" << str + << "\".\n"; + return false; + } + if (!kvs.emplace(kv[0], kv[1]).second) { + std::cerr << src_text << " is expected to contain unique keys but key \"" + << kv[0] << "\" was repeated.\n"; + return false; + } + } + + *value = kvs; + return true; +} + // Returns the name of the environment variable corresponding to the // given flag. For example, FlagToEnvVar("foo") will return // "BENCHMARK_FOO" in the open-source version. 
@@ -87,49 +116,64 @@ static std::string FlagToEnvVar(const char* flag) { for (size_t i = 0; i != flag_str.length(); ++i) env_var += static_cast(::toupper(flag_str.c_str()[i])); - return "BENCHMARK_" + env_var; + return env_var; } } // namespace -// Reads and returns the Boolean environment variable corresponding to -// the given flag; if it's not set, returns default_value. -// -// The value is considered true iff it's not "0". -bool BoolFromEnv(const char* flag, bool default_value) { +BENCHMARK_EXPORT +bool BoolFromEnv(const char* flag, bool default_val) { const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = getenv(env_var.c_str()); - return string_value == nullptr ? default_value - : strcmp(string_value, "0") != 0; + const char* const value_str = getenv(env_var.c_str()); + return value_str == nullptr ? default_val : IsTruthyFlagValue(value_str); } -// Reads and returns a 32-bit integer stored in the environment -// variable corresponding to the given flag; if it isn't set or -// doesn't represent a valid 32-bit integer, returns default_value. -int32_t Int32FromEnv(const char* flag, int32_t default_value) { +BENCHMARK_EXPORT +int32_t Int32FromEnv(const char* flag, int32_t default_val) { const std::string env_var = FlagToEnvVar(flag); - const char* const string_value = getenv(env_var.c_str()); - if (string_value == nullptr) { - // The environment variable is not set. 
- return default_value; + const char* const value_str = getenv(env_var.c_str()); + int32_t value = default_val; + if (value_str == nullptr || + !ParseInt32(std::string("Environment variable ") + env_var, value_str, + &value)) { + return default_val; } + return value; +} - int32_t result = default_value; - if (!ParseInt32(std::string("Environment variable ") + env_var, string_value, - &result)) { - std::cout << "The default value " << default_value << " is used.\n"; - return default_value; +BENCHMARK_EXPORT +double DoubleFromEnv(const char* flag, double default_val) { + const std::string env_var = FlagToEnvVar(flag); + const char* const value_str = getenv(env_var.c_str()); + double value = default_val; + if (value_str == nullptr || + !ParseDouble(std::string("Environment variable ") + env_var, value_str, + &value)) { + return default_val; } - - return result; + return value; } -// Reads and returns the string environment variable corresponding to -// the given flag; if it's not set, returns default_value. -const char* StringFromEnv(const char* flag, const char* default_value) { +BENCHMARK_EXPORT +const char* StringFromEnv(const char* flag, const char* default_val) { const std::string env_var = FlagToEnvVar(flag); const char* const value = getenv(env_var.c_str()); - return value == nullptr ? default_value : value; + return value == nullptr ? default_val : value; +} + +BENCHMARK_EXPORT +std::map KvPairsFromEnv( + const char* flag, std::map default_val) { + const std::string env_var = FlagToEnvVar(flag); + const char* const value_str = getenv(env_var.c_str()); + + if (value_str == nullptr) return default_val; + + std::map value; + if (!ParseKvPairs("Environment variable " + env_var, value_str, &value)) { + return default_val; + } + return value; } // Parses a string as a command line flag. 
The string should have @@ -162,6 +206,7 @@ const char* ParseFlagValue(const char* str, const char* flag, return flag_end + 1; } +BENCHMARK_EXPORT bool ParseBoolFlag(const char* str, const char* flag, bool* value) { // Gets the value of the flag as a string. const char* const value_str = ParseFlagValue(str, flag, true); @@ -174,6 +219,7 @@ bool ParseBoolFlag(const char* str, const char* flag, bool* value) { return true; } +BENCHMARK_EXPORT bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { // Gets the value of the flag as a string. const char* const value_str = ParseFlagValue(str, flag, false); @@ -186,6 +232,7 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) { value); } +BENCHMARK_EXPORT bool ParseDoubleFlag(const char* str, const char* flag, double* value) { // Gets the value of the flag as a string. const char* const value_str = ParseFlagValue(str, flag, false); @@ -198,6 +245,7 @@ bool ParseDoubleFlag(const char* str, const char* flag, double* value) { value); } +BENCHMARK_EXPORT bool ParseStringFlag(const char* str, const char* flag, std::string* value) { // Gets the value of the flag as a string. 
const char* const value_str = ParseFlagValue(str, flag, false); @@ -209,14 +257,42 @@ bool ParseStringFlag(const char* str, const char* flag, std::string* value) { return true; } +BENCHMARK_EXPORT +bool ParseKeyValueFlag(const char* str, const char* flag, + std::map* value) { + const char* const value_str = ParseFlagValue(str, flag, false); + + if (value_str == nullptr) return false; + + for (const auto& kvpair : StrSplit(value_str, ',')) { + const auto kv = StrSplit(kvpair, '='); + if (kv.size() != 2) return false; + value->emplace(kv[0], kv[1]); + } + + return true; +} + +BENCHMARK_EXPORT bool IsFlag(const char* str, const char* flag) { return (ParseFlagValue(str, flag, true) != nullptr); } +BENCHMARK_EXPORT bool IsTruthyFlagValue(const std::string& value) { - if (value.empty()) return true; - char ch = value[0]; - return isalnum(ch) && - !(ch == '0' || ch == 'f' || ch == 'F' || ch == 'n' || ch == 'N'); + if (value.size() == 1) { + char v = value[0]; + return isalnum(v) && + !(v == '0' || v == 'f' || v == 'F' || v == 'n' || v == 'N'); + } + if (!value.empty()) { + std::string value_lower(value); + std::transform(value_lower.begin(), value_lower.end(), value_lower.begin(), + [](char c) { return static_cast(::tolower(c)); }); + return !(value_lower == "false" || value_lower == "no" || + value_lower == "off"); + } + return true; } + } // end namespace benchmark diff --git a/ThirdParty/googlebenchmark/src/commandlineflags.h b/ThirdParty/googlebenchmark/src/commandlineflags.h index 5eaea82a59..7882628975 100644 --- a/ThirdParty/googlebenchmark/src/commandlineflags.h +++ b/ThirdParty/googlebenchmark/src/commandlineflags.h @@ -2,33 +2,80 @@ #define BENCHMARK_COMMANDLINEFLAGS_H_ #include +#include #include +#include "benchmark/export.h" + // Macro for referencing flags. #define FLAG(name) FLAGS_##name // Macros for declaring flags. 
-#define DECLARE_bool(name) extern bool FLAG(name) -#define DECLARE_int32(name) extern int32_t FLAG(name) -#define DECLARE_int64(name) extern int64_t FLAG(name) -#define DECLARE_double(name) extern double FLAG(name) -#define DECLARE_string(name) extern std::string FLAG(name) +#define BM_DECLARE_bool(name) BENCHMARK_EXPORT extern bool FLAG(name) +#define BM_DECLARE_int32(name) BENCHMARK_EXPORT extern int32_t FLAG(name) +#define BM_DECLARE_double(name) BENCHMARK_EXPORT extern double FLAG(name) +#define BM_DECLARE_string(name) BENCHMARK_EXPORT extern std::string FLAG(name) +#define BM_DECLARE_kvpairs(name) \ + BENCHMARK_EXPORT extern std::map FLAG(name) // Macros for defining flags. -#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val) -#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val) -#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val) -#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val) -#define DEFINE_string(name, default_val, doc) \ - std::string FLAG(name) = (default_val) +#define BM_DEFINE_bool(name, default_val) \ + BENCHMARK_EXPORT bool FLAG(name) = benchmark::BoolFromEnv(#name, default_val) +#define BM_DEFINE_int32(name, default_val) \ + BENCHMARK_EXPORT int32_t FLAG(name) = \ + benchmark::Int32FromEnv(#name, default_val) +#define BM_DEFINE_double(name, default_val) \ + BENCHMARK_EXPORT double FLAG(name) = \ + benchmark::DoubleFromEnv(#name, default_val) +#define BM_DEFINE_string(name, default_val) \ + BENCHMARK_EXPORT std::string FLAG(name) = \ + benchmark::StringFromEnv(#name, default_val) +#define BM_DEFINE_kvpairs(name, default_val) \ + BENCHMARK_EXPORT std::map FLAG(name) = \ + benchmark::KvPairsFromEnv(#name, default_val) namespace benchmark { -// Parses a bool/Int32/string from the environment variable -// corresponding to the given Google Test flag. + +// Parses a bool from the environment variable corresponding to the given flag. 
+// +// If the variable exists, returns IsTruthyFlagValue() value; if not, +// returns the given default value. +BENCHMARK_EXPORT bool BoolFromEnv(const char* flag, bool default_val); + +// Parses an Int32 from the environment variable corresponding to the given +// flag. +// +// If the variable exists, returns ParseInt32() value; if not, returns +// the given default value. +BENCHMARK_EXPORT int32_t Int32FromEnv(const char* flag, int32_t default_val); + +// Parses an Double from the environment variable corresponding to the given +// flag. +// +// If the variable exists, returns ParseDouble(); if not, returns +// the given default value. +BENCHMARK_EXPORT +double DoubleFromEnv(const char* flag, double default_val); + +// Parses a string from the environment variable corresponding to the given +// flag. +// +// If variable exists, returns its value; if not, returns +// the given default value. +BENCHMARK_EXPORT const char* StringFromEnv(const char* flag, const char* default_val); +// Parses a set of kvpairs from the environment variable corresponding to the +// given flag. +// +// If variable exists, returns its value; if not, returns +// the given default value. +BENCHMARK_EXPORT +std::map KvPairsFromEnv( + const char* flag, std::map default_val); + // Parses a string for a bool flag, in the form of either // "--flag=value" or "--flag". // @@ -38,36 +85,49 @@ const char* StringFromEnv(const char* flag, const char* default_val); // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. +BENCHMARK_EXPORT bool ParseBoolFlag(const char* str, const char* flag, bool* value); -// Parses a string for an Int32 flag, in the form of -// "--flag=value". +// Parses a string for an Int32 flag, in the form of "--flag=value". // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. 
+BENCHMARK_EXPORT bool ParseInt32Flag(const char* str, const char* flag, int32_t* value); -// Parses a string for a Double flag, in the form of -// "--flag=value". +// Parses a string for a Double flag, in the form of "--flag=value". // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. +BENCHMARK_EXPORT bool ParseDoubleFlag(const char* str, const char* flag, double* value); -// Parses a string for a string flag, in the form of -// "--flag=value". +// Parses a string for a string flag, in the form of "--flag=value". // // On success, stores the value of the flag in *value, and returns // true. On failure, returns false without changing *value. +BENCHMARK_EXPORT bool ParseStringFlag(const char* str, const char* flag, std::string* value); +// Parses a string for a kvpairs flag in the form "--flag=key=value,key=value" +// +// On success, stores the value of the flag in *value and returns true. On +// failure returns false, though *value may have been mutated. +BENCHMARK_EXPORT +bool ParseKeyValueFlag(const char* str, const char* flag, + std::map* value); + // Returns true if the string matches the flag. +BENCHMARK_EXPORT bool IsFlag(const char* str, const char* flag); // Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or -// some non-alphanumeric character. As a special case, also returns true if -// value is the empty string. +// some non-alphanumeric character. Also returns false if the value matches +// one of 'no', 'false', 'off' (case-insensitive). As a special case, also +// returns true if value is the empty string. 
+BENCHMARK_EXPORT bool IsTruthyFlagValue(const std::string& value); + } // end namespace benchmark #endif // BENCHMARK_COMMANDLINEFLAGS_H_ diff --git a/ThirdParty/googlebenchmark/src/complexity.cc b/ThirdParty/googlebenchmark/src/complexity.cc index 79e00c64e1..825c57394a 100644 --- a/ThirdParty/googlebenchmark/src/complexity.cc +++ b/ThirdParty/googlebenchmark/src/complexity.cc @@ -15,12 +15,13 @@ // Source project : https://github.com/ismaelJimenez/cpp.leastsq // Adapted to be used with google benchmark -#include "benchmark/benchmark.h" +#include "complexity.h" #include #include + +#include "benchmark/benchmark.h" #include "check.h" -#include "complexity.h" namespace benchmark { @@ -123,10 +124,10 @@ LeastSq MinimalLeastSq(const std::vector& n, // fitting curve. LeastSq MinimalLeastSq(const std::vector& n, const std::vector& time, const BigO complexity) { - CHECK_EQ(n.size(), time.size()); - CHECK_GE(n.size(), 2); // Do not compute fitting curve is less than two - // benchmark runs are given - CHECK_NE(complexity, oNone); + BM_CHECK_EQ(n.size(), time.size()); + BM_CHECK_GE(n.size(), 2); // Do not compute fitting curve is less than two + // benchmark runs are given + BM_CHECK_NE(complexity, oNone); LeastSq best_fit; @@ -167,7 +168,8 @@ std::vector ComputeBigO( // Populate the accumulators. for (const Run& run : reports) { - CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?"; + BM_CHECK_GT(run.complexity_n, 0) + << "Did you forget to call SetComplexityN?"; n.push_back(run.complexity_n); real_time.push_back(run.real_accumulated_time / run.iterations); cpu_time.push_back(run.cpu_accumulated_time / run.iterations); @@ -191,11 +193,14 @@ std::vector ComputeBigO( // Get the data from the accumulator to BenchmarkReporter::Run's. 
Run big_o; big_o.run_name = run_name; + big_o.family_index = reports[0].family_index; + big_o.per_family_instance_index = reports[0].per_family_instance_index; big_o.run_type = BenchmarkReporter::Run::RT_Aggregate; big_o.repetitions = reports[0].repetitions; big_o.repetition_index = Run::no_repetition_index; big_o.threads = reports[0].threads; big_o.aggregate_name = "BigO"; + big_o.aggregate_unit = StatisticUnit::kTime; big_o.report_label = reports[0].report_label; big_o.iterations = 0; big_o.real_accumulated_time = result_real.coef; @@ -213,8 +218,11 @@ std::vector ComputeBigO( // Only add label to mean/stddev if it is same for all runs Run rms; rms.run_name = run_name; + rms.family_index = reports[0].family_index; + rms.per_family_instance_index = reports[0].per_family_instance_index; rms.run_type = BenchmarkReporter::Run::RT_Aggregate; rms.aggregate_name = "RMS"; + rms.aggregate_unit = StatisticUnit::kPercentage; rms.report_label = big_o.report_label; rms.iterations = 0; rms.repetition_index = Run::no_repetition_index; diff --git a/ThirdParty/googlebenchmark/src/complexity.h b/ThirdParty/googlebenchmark/src/complexity.h index df29b48d29..0a0679b48b 100644 --- a/ThirdParty/googlebenchmark/src/complexity.h +++ b/ThirdParty/googlebenchmark/src/complexity.h @@ -31,7 +31,7 @@ std::vector ComputeBigO( const std::vector& reports); // This data structure will contain the result returned by MinimalLeastSq -// - coef : Estimated coeficient for the high-order term as +// - coef : Estimated coefficient for the high-order term as // interpolated from data. // - rms : Normalized Root Mean Squared Error. // - complexity : Scalability form (e.g. oN, oNLogN). 
In case a scalability diff --git a/ThirdParty/googlebenchmark/src/console_reporter.cc b/ThirdParty/googlebenchmark/src/console_reporter.cc index cc8ae276f6..10e05e133e 100644 --- a/ThirdParty/googlebenchmark/src/console_reporter.cc +++ b/ThirdParty/googlebenchmark/src/console_reporter.cc @@ -12,27 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "benchmark/benchmark.h" -#include "complexity.h" -#include "counter.h" - #include #include #include +#include #include #include #include #include +#include "benchmark/benchmark.h" #include "check.h" #include "colorprint.h" #include "commandlineflags.h" +#include "complexity.h" +#include "counter.h" #include "internal_macros.h" #include "string_util.h" #include "timers.h" namespace benchmark { +BENCHMARK_EXPORT bool ConsoleReporter::ReportContext(const Context& context) { name_field_width_ = context.name_field_width; printed_header_ = false; @@ -45,19 +46,21 @@ bool ConsoleReporter::ReportContext(const Context& context) { GetErrorStream() << "Color printing is only supported for stdout on windows." 
" Disabling color printing\n"; - output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color); + output_options_ = static_cast(output_options_ & ~OO_Color); } #endif return true; } +BENCHMARK_EXPORT void ConsoleReporter::PrintHeader(const Run& run) { - std::string str = FormatString("%-*s %13s %15s %12s", static_cast(name_field_width_), - "Benchmark", "Time", "CPU", "Iterations"); - if(!run.counters.empty()) { - if(output_options_ & OO_Tabular) { - for(auto const& c : run.counters) { + std::string str = + FormatString("%-*s %13s %15s %12s", static_cast(name_field_width_), + "Benchmark", "Time", "CPU", "Iterations"); + if (!run.counters.empty()) { + if (output_options_ & OO_Tabular) { + for (auto const& c : run.counters) { str += FormatString(" %10s", c.first.c_str()); } } else { @@ -68,6 +71,7 @@ void ConsoleReporter::PrintHeader(const Run& run) { GetOutputStream() << line << "\n" << str << "\n" << line << "\n"; } +BENCHMARK_EXPORT void ConsoleReporter::ReportRuns(const std::vector& reports) { for (const auto& run : reports) { // print the header: @@ -97,8 +101,10 @@ static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt, va_end(args); } - static std::string FormatTime(double time) { + // For the time columns of the console printer 13 digits are reserved. One of + // them is a space and max two of them are the time unit (e.g ns). That puts + // us at 10 digits usable for the number. // Align decimal places... if (time < 1.0) { return FormatString("%10.3f", time); @@ -109,22 +115,33 @@ static std::string FormatTime(double time) { if (time < 100.0) { return FormatString("%10.1f", time); } + // Assuming the time is at max 9.9999e+99 and we have 10 digits for the + // number, we get 10-1(.)-1(e)-1(sign)-2(exponent) = 5 digits to print. 
+ if (time > 9999999999 /*max 10 digit number*/) { + return FormatString("%1.4e", time); + } return FormatString("%10.0f", time); } +BENCHMARK_EXPORT void ConsoleReporter::PrintRunData(const Run& result) { typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...); auto& Out = GetOutputStream(); - PrinterFn* printer = (output_options_ & OO_Color) ? - (PrinterFn*)ColorPrintf : IgnoreColorPrint; + PrinterFn* printer = (output_options_ & OO_Color) + ? static_cast(ColorPrintf) + : IgnoreColorPrint; auto name_color = (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN; printer(Out, name_color, "%-*s ", name_field_width_, result.benchmark_name().c_str()); - if (result.error_occurred) { + if (internal::SkippedWithError == result.skipped) { printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'", - result.error_message.c_str()); + result.skip_message.c_str()); + printer(Out, COLOR_DEFAULT, "\n"); + return; + } else if (internal::SkippedWithMessage == result.skipped) { + printer(Out, COLOR_WHITE, "SKIPPED: \'%s\'", result.skip_message.c_str()); printer(Out, COLOR_DEFAULT, "\n"); return; } @@ -134,18 +151,23 @@ void ConsoleReporter::PrintRunData(const Run& result) { const std::string real_time_str = FormatTime(real_time); const std::string cpu_time_str = FormatTime(cpu_time); - if (result.report_big_o) { std::string big_o = GetBigOString(result.complexity); - printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, big_o.c_str(), - cpu_time, big_o.c_str()); + printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, + big_o.c_str(), cpu_time, big_o.c_str()); } else if (result.report_rms) { printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%", cpu_time * 100, "%"); - } else { + } else if (result.run_type != Run::RT_Aggregate || + result.aggregate_unit == StatisticUnit::kTime) { const char* timeLabel = GetTimeUnitString(result.time_unit); - printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), 
timeLabel, - cpu_time_str.c_str(), timeLabel); + printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), + timeLabel, cpu_time_str.c_str(), timeLabel); + } else { + assert(result.aggregate_unit == StatisticUnit::kPercentage); + printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", + (100. * result.real_accumulated_time), "%", + (100. * result.cpu_accumulated_time), "%"); } if (!result.report_big_o && !result.report_rms) { @@ -153,19 +175,24 @@ void ConsoleReporter::PrintRunData(const Run& result) { } for (auto& c : result.counters) { - const std::size_t cNameLen = std::max(std::string::size_type(10), - c.first.length()); - auto const& s = HumanReadableNumber(c.second.value, c.second.oneK); - if (output_options_ & OO_Tabular) { - if (c.second.flags & Counter::kIsRate) { - printer(Out, COLOR_DEFAULT, " %*s/s", cNameLen - 2, s.c_str()); - } else { - printer(Out, COLOR_DEFAULT, " %*s", cNameLen, s.c_str()); - } + const std::size_t cNameLen = + std::max(std::string::size_type(10), c.first.length()); + std::string s; + const char* unit = ""; + if (result.run_type == Run::RT_Aggregate && + result.aggregate_unit == StatisticUnit::kPercentage) { + s = StrFormat("%.2f", 100. * c.second.value); + unit = "%"; } else { - const char* unit = (c.second.flags & Counter::kIsRate) ? "/s" : ""; - printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(), + s = HumanReadableNumber(c.second.value, c.second.oneK); + if (c.second.flags & Counter::kIsRate) + unit = (c.second.flags & Counter::kInvert) ? 
"s" : "/s"; + } + if (output_options_ & OO_Tabular) { + printer(Out, COLOR_DEFAULT, " %*s%s", cNameLen - strlen(unit), s.c_str(), unit); + } else { + printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(), unit); } } diff --git a/ThirdParty/googlebenchmark/src/counter.cc b/ThirdParty/googlebenchmark/src/counter.cc index c248ea110b..cf5b78ee3a 100644 --- a/ThirdParty/googlebenchmark/src/counter.cc +++ b/ThirdParty/googlebenchmark/src/counter.cc @@ -32,6 +32,10 @@ double Finish(Counter const& c, IterationCount iterations, double cpu_time, if (c.flags & Counter::kAvgIterations) { v /= iterations; } + + if (c.flags & Counter::kInvert) { // Invert is *always* last. + v = 1.0 / v; + } return v; } diff --git a/ThirdParty/googlebenchmark/src/counter.h b/ThirdParty/googlebenchmark/src/counter.h index 1ad46d4940..1f5a58e31f 100644 --- a/ThirdParty/googlebenchmark/src/counter.h +++ b/ThirdParty/googlebenchmark/src/counter.h @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef BENCHMARK_COUNTER_H_ +#define BENCHMARK_COUNTER_H_ + #include "benchmark/benchmark.h" namespace benchmark { @@ -25,3 +28,5 @@ bool SameNames(UserCounters const& l, UserCounters const& r); } // end namespace internal } // end namespace benchmark + +#endif // BENCHMARK_COUNTER_H_ diff --git a/ThirdParty/googlebenchmark/src/csv_reporter.cc b/ThirdParty/googlebenchmark/src/csv_reporter.cc index af2c18fc8a..7b56da107e 100644 --- a/ThirdParty/googlebenchmark/src/csv_reporter.cc +++ b/ThirdParty/googlebenchmark/src/csv_reporter.cc @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "benchmark/benchmark.h" -#include "complexity.h" - #include #include #include @@ -22,7 +19,9 @@ #include #include +#include "benchmark/benchmark.h" #include "check.h" +#include "complexity.h" #include "string_util.h" #include "timers.h" @@ -37,23 +36,29 @@ std::vector elements = { "error_occurred", "error_message"}; } // namespace -std::string CsvEscape(const std::string & s) { +std::string CsvEscape(const std::string& s) { std::string tmp; tmp.reserve(s.size() + 2); for (char c : s) { switch (c) { - case '"' : tmp += "\"\""; break; - default : tmp += c; break; + case '"': + tmp += "\"\""; + break; + default: + tmp += c; + break; } } return '"' + tmp + '"'; } +BENCHMARK_EXPORT bool CSVReporter::ReportContext(const Context& context) { PrintBasicContext(&GetErrorStream(), context); return true; } +BENCHMARK_EXPORT void CSVReporter::ReportRuns(const std::vector& reports) { std::ostream& Out = GetOutputStream(); @@ -85,7 +90,8 @@ void CSVReporter::ReportRuns(const std::vector& reports) { for (const auto& cnt : run.counters) { if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second") continue; - CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end()) + BM_CHECK(user_counter_names_.find(cnt.first) != + user_counter_names_.end()) << "All counters must be present in each run. 
" << "Counter named \"" << cnt.first << "\" was not in a run after being added to the header"; @@ -99,13 +105,14 @@ void CSVReporter::ReportRuns(const std::vector& reports) { } } +BENCHMARK_EXPORT void CSVReporter::PrintRunData(const Run& run) { std::ostream& Out = GetOutputStream(); Out << CsvEscape(run.benchmark_name()) << ","; - if (run.error_occurred) { + if (run.skipped) { Out << std::string(elements.size() - 3, ','); - Out << "true,"; - Out << CsvEscape(run.error_message) << "\n"; + Out << std::boolalpha << (internal::SkippedWithError == run.skipped) << ","; + Out << CsvEscape(run.skip_message) << "\n"; return; } diff --git a/ThirdParty/googlebenchmark/src/cycleclock.h b/ThirdParty/googlebenchmark/src/cycleclock.h index f5e37b011b..ae1ef2d2d2 100644 --- a/ThirdParty/googlebenchmark/src/cycleclock.h +++ b/ThirdParty/googlebenchmark/src/cycleclock.h @@ -36,7 +36,8 @@ // declarations of some other intrinsics, breaking compilation. // Therefore, we simply declare __rdtsc ourselves. See also // http://connect.microsoft.com/VisualStudio/feedback/details/262047 -#if defined(COMPILER_MSVC) && !defined(_M_IX86) +#if defined(COMPILER_MSVC) && !defined(_M_IX86) && !defined(_M_ARM64) && \ + !defined(_M_ARM64EC) extern "C" uint64_t __rdtsc(); #pragma intrinsic(__rdtsc) #endif @@ -84,13 +85,21 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() { return (high << 32) | low; #elif defined(__powerpc__) || defined(__ppc__) // This returns a time-base, which is not always precisely a cycle-count. 
- int64_t tbl, tbu0, tbu1; - asm("mftbu %0" : "=r"(tbu0)); - asm("mftb %0" : "=r"(tbl)); - asm("mftbu %0" : "=r"(tbu1)); - tbl &= -static_cast(tbu0 == tbu1); - // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage) - return (tbu1 << 32) | tbl; +#if defined(__powerpc64__) || defined(__ppc64__) + int64_t tb; + asm volatile("mfspr %0, 268" : "=r"(tb)); + return tb; +#else + uint32_t tbl, tbu0, tbu1; + asm volatile( + "mftbu %0\n" + "mftb %1\n" + "mftbu %2" + : "=r"(tbu0), "=r"(tbl), "=r"(tbu1)); + tbl &= -static_cast(tbu0 == tbu1); + // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is no longer needed) + return (static_cast(tbu1) << 32) | tbl; +#endif #elif defined(__sparc__) int64_t tick; asm(".byte 0x83, 0x41, 0x00, 0x00"); @@ -106,6 +115,12 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() { // when I know it will work. Otherwise, I'll use __rdtsc and hope // the code is being compiled with a non-ancient compiler. _asm rdtsc +#elif defined(COMPILER_MSVC) && (defined(_M_ARM64) || defined(_M_ARM64EC)) + // See // https://docs.microsoft.com/en-us/cpp/intrinsics/arm64-intrinsics + // and https://reviews.llvm.org/D53115 + int64_t virtual_timer_value; + virtual_timer_value = _ReadStatusReg(ARM64_CNTVCT); + return virtual_timer_value; #elif defined(COMPILER_MSVC) return __rdtsc(); #elif defined(BENCHMARK_OS_NACL) @@ -118,7 +133,7 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() { // Native Client does not provide any API to access cycle counter. // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday - // because is provides nanosecond resolution (which is noticable at + // because is provides nanosecond resolution (which is noticeable at // least for PNaCl modules running on x86 Mac & Linux). // Initialize to always return 0 if clock_gettime fails. 
struct timespec ts = {0, 0}; @@ -153,17 +168,55 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() { struct timeval tv; gettimeofday(&tv, nullptr); return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; -#elif defined(__mips__) +#elif defined(__mips__) || defined(__m68k__) // mips apparently only allows rdtsc for superusers, so we fall // back to gettimeofday. It's possible clock_gettime would be better. struct timeval tv; gettimeofday(&tv, nullptr); return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#elif defined(__loongarch__) || defined(__csky__) + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; #elif defined(__s390__) // Covers both s390 and s390x. // Return the CPU clock. uint64_t tsc; +#if defined(BENCHMARK_OS_ZOS) && defined(COMPILER_IBMXL) + // z/OS XL compiler HLASM syntax. + asm(" stck %0" : "=m"(tsc) : : "cc"); +#else asm("stck %0" : "=Q"(tsc) : : "cc"); +#endif return tsc; +#elif defined(__riscv) // RISC-V + // Use RDCYCLE (and RDCYCLEH on riscv32) +#if __riscv_xlen == 32 + uint32_t cycles_lo, cycles_hi0, cycles_hi1; + // This asm also includes the PowerPC overflow handling strategy, as above. + // Implemented in assembly because Clang insisted on branching. + asm volatile( + "rdcycleh %0\n" + "rdcycle %1\n" + "rdcycleh %2\n" + "sub %0, %0, %2\n" + "seqz %0, %0\n" + "sub %0, zero, %0\n" + "and %1, %1, %0\n" + : "=r"(cycles_hi0), "=r"(cycles_lo), "=r"(cycles_hi1)); + return (static_cast(cycles_hi1) << 32) | cycles_lo; +#else + uint64_t cycles; + asm volatile("rdcycle %0" : "=r"(cycles)); + return cycles; +#endif +#elif defined(__e2k__) || defined(__elbrus__) + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; +#elif defined(__hexagon__) + uint64_t pcycle; + asm volatile("%0 = C15:14" : "=r"(pcycle)); + return static_cast(pcycle); #else // The soft failover to a generic implementation is automatic only for ARM. 
// For other platforms the developer is expected to make an attempt to create diff --git a/ThirdParty/googlebenchmark/src/internal_macros.h b/ThirdParty/googlebenchmark/src/internal_macros.h index 6adf00d056..8dd7d0c650 100644 --- a/ThirdParty/googlebenchmark/src/internal_macros.h +++ b/ThirdParty/googlebenchmark/src/internal_macros.h @@ -1,8 +1,6 @@ #ifndef BENCHMARK_INTERNAL_MACROS_H_ #define BENCHMARK_INTERNAL_MACROS_H_ -#include "benchmark/benchmark.h" - /* Needed to detect STL */ #include @@ -13,7 +11,11 @@ #endif #if defined(__clang__) - #if !defined(COMPILER_CLANG) + #if defined(__ibmxl__) + #if !defined(COMPILER_IBMXL) + #define COMPILER_IBMXL + #endif + #elif !defined(COMPILER_CLANG) #define COMPILER_CLANG #endif #elif defined(_MSC_VER) @@ -40,6 +42,19 @@ #define BENCHMARK_OS_CYGWIN 1 #elif defined(_WIN32) #define BENCHMARK_OS_WINDOWS 1 + // WINAPI_FAMILY_PARTITION is defined in winapifamily.h. + // We include windows.h which implicitly includes winapifamily.h for compatibility. 
+ #ifndef NOMINMAX + #define NOMINMAX + #endif + #include + #if defined(WINAPI_FAMILY_PARTITION) + #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) + #define BENCHMARK_OS_WINDOWS_WIN32 1 + #elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) + #define BENCHMARK_OS_WINDOWS_RT 1 + #endif + #endif #if defined(__MINGW32__) #define BENCHMARK_OS_MINGW 1 #endif @@ -58,6 +73,8 @@ #define BENCHMARK_OS_NETBSD 1 #elif defined(__OpenBSD__) #define BENCHMARK_OS_OPENBSD 1 +#elif defined(__DragonFly__) + #define BENCHMARK_OS_DRAGONFLY 1 #elif defined(__linux__) #define BENCHMARK_OS_LINUX 1 #elif defined(__native_client__) @@ -72,6 +89,10 @@ #define BENCHMARK_OS_SOLARIS 1 #elif defined(__QNX__) #define BENCHMARK_OS_QNX 1 +#elif defined(__MVS__) +#define BENCHMARK_OS_ZOS 1 +#elif defined(__hexagon__) +#define BENCHMARK_OS_QURT 1 #endif #if defined(__ANDROID__) && defined(__GLIBCXX__) diff --git a/ThirdParty/googlebenchmark/src/json_reporter.cc b/ThirdParty/googlebenchmark/src/json_reporter.cc index 11db2b99d5..6559dfd5e6 100644 --- a/ThirdParty/googlebenchmark/src/json_reporter.cc +++ b/ThirdParty/googlebenchmark/src/json_reporter.cc @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "benchmark/benchmark.h" -#include "complexity.h" - #include #include #include @@ -25,41 +22,61 @@ #include #include +#include "benchmark/benchmark.h" +#include "complexity.h" #include "string_util.h" #include "timers.h" namespace benchmark { - namespace { -std::string StrEscape(const std::string & s) { +std::string StrEscape(const std::string& s) { std::string tmp; tmp.reserve(s.size()); for (char c : s) { switch (c) { - case '\b': tmp += "\\b"; break; - case '\f': tmp += "\\f"; break; - case '\n': tmp += "\\n"; break; - case '\r': tmp += "\\r"; break; - case '\t': tmp += "\\t"; break; - case '\\': tmp += "\\\\"; break; - case '"' : tmp += "\\\""; break; - default : tmp += c; break; + case '\b': + tmp += "\\b"; + break; + case '\f': + tmp += "\\f"; + break; + case '\n': + tmp += "\\n"; + break; + case '\r': + tmp += "\\r"; + break; + case '\t': + tmp += "\\t"; + break; + case '\\': + tmp += "\\\\"; + break; + case '"': + tmp += "\\\""; + break; + default: + tmp += c; + break; } } return tmp; } std::string FormatKV(std::string const& key, std::string const& value) { - return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str()); + return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), + StrEscape(value).c_str()); } std::string FormatKV(std::string const& key, const char* value) { - return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str()); + return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), + StrEscape(value).c_str()); } std::string FormatKV(std::string const& key, bool value) { - return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ? "true" : "false"); + return StrFormat("\"%s\": %s", StrEscape(key).c_str(), + value ? 
"true" : "false"); } std::string FormatKV(std::string const& key, int64_t value) { @@ -68,12 +85,6 @@ std::string FormatKV(std::string const& key, int64_t value) { return ss.str(); } -std::string FormatKV(std::string const& key, IterationCount value) { - std::stringstream ss; - ss << '"' << StrEscape(key) << "\": " << value; - return ss.str(); -} - std::string FormatKV(std::string const& key, double value) { std::stringstream ss; ss << '"' << StrEscape(key) << "\": "; @@ -92,7 +103,7 @@ std::string FormatKV(std::string const& key, double value) { return ss.str(); } -int64_t RoundDouble(double v) { return static_cast(v + 0.5); } +int64_t RoundDouble(double v) { return std::lround(v); } } // end namespace @@ -122,8 +133,12 @@ bool JSONReporter::ReportContext(const Context& context) { << FormatKV("mhz_per_cpu", RoundDouble(info.cycles_per_second / 1000000.0)) << ",\n"; - out << indent << FormatKV("cpu_scaling_enabled", info.scaling_enabled) - << ",\n"; + if (CPUInfo::Scaling::UNKNOWN != info.scaling) { + out << indent + << FormatKV("cpu_scaling_enabled", + info.scaling == CPUInfo::Scaling::ENABLED ? 
true : false) + << ",\n"; + } out << indent << "\"caches\": [\n"; indent = std::string(6, ' '); @@ -134,8 +149,8 @@ bool JSONReporter::ReportContext(const Context& context) { out << cache_indent << FormatKV("type", CI.type) << ",\n"; out << cache_indent << FormatKV("level", static_cast(CI.level)) << ",\n"; - out << cache_indent - << FormatKV("size", static_cast(CI.size) * 1000u) << ",\n"; + out << cache_indent << FormatKV("size", static_cast(CI.size)) + << ",\n"; out << cache_indent << FormatKV("num_sharing", static_cast(CI.num_sharing)) << "\n"; @@ -157,7 +172,19 @@ bool JSONReporter::ReportContext(const Context& context) { #else const char build_type[] = "debug"; #endif - out << indent << FormatKV("library_build_type", build_type) << "\n"; + out << indent << FormatKV("library_build_type", build_type); + + std::map* global_context = + internal::GetGlobalContext(); + + if (global_context != nullptr) { + for (const auto& kv : *global_context) { + out << ",\n"; + out << indent << FormatKV(kv.first, kv.second); + } + } + out << "\n"; + // Close context block and open the list of benchmarks. 
out << inner_indent << "},\n"; out << inner_indent << "\"benchmarks\": [\n"; @@ -195,6 +222,10 @@ void JSONReporter::PrintRunData(Run const& run) { std::string indent(6, ' '); std::ostream& out = GetOutputStream(); out << indent << FormatKV("name", run.benchmark_name()) << ",\n"; + out << indent << FormatKV("family_index", run.family_index) << ",\n"; + out << indent + << FormatKV("per_family_instance_index", run.per_family_instance_index) + << ",\n"; out << indent << FormatKV("run_name", run.run_name.str()) << ",\n"; out << indent << FormatKV("run_type", [&run]() -> const char* { switch (run.run_type) { @@ -213,15 +244,36 @@ void JSONReporter::PrintRunData(Run const& run) { out << indent << FormatKV("threads", run.threads) << ",\n"; if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) { out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n"; + out << indent << FormatKV("aggregate_unit", [&run]() -> const char* { + switch (run.aggregate_unit) { + case StatisticUnit::kTime: + return "time"; + case StatisticUnit::kPercentage: + return "percentage"; + } + BENCHMARK_UNREACHABLE(); + }()) << ",\n"; } - if (run.error_occurred) { - out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n"; - out << indent << FormatKV("error_message", run.error_message) << ",\n"; + if (internal::SkippedWithError == run.skipped) { + out << indent << FormatKV("error_occurred", true) << ",\n"; + out << indent << FormatKV("error_message", run.skip_message) << ",\n"; + } else if (internal::SkippedWithMessage == run.skipped) { + out << indent << FormatKV("skipped", true) << ",\n"; + out << indent << FormatKV("skip_message", run.skip_message) << ",\n"; } if (!run.report_big_o && !run.report_rms) { out << indent << FormatKV("iterations", run.iterations) << ",\n"; - out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n"; - out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime()); + if (run.run_type != Run::RT_Aggregate || + 
run.aggregate_unit == StatisticUnit::kTime) { + out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) + << ",\n"; + out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime()); + } else { + assert(run.aggregate_unit == StatisticUnit::kPercentage); + out << indent << FormatKV("real_time", run.real_accumulated_time) + << ",\n"; + out << indent << FormatKV("cpu_time", run.cpu_accumulated_time); + } out << ",\n" << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit)); } else if (run.report_big_o) { @@ -239,9 +291,21 @@ void JSONReporter::PrintRunData(Run const& run) { out << ",\n" << indent << FormatKV(c.first, c.second); } - if (run.has_memory_result) { + if (run.memory_result) { + const MemoryManager::Result memory_result = *run.memory_result; out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter); - out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used); + out << ",\n" + << indent << FormatKV("max_bytes_used", memory_result.max_bytes_used); + + auto report_if_present = [&out, &indent](const std::string& label, + int64_t val) { + if (val != MemoryManager::TombstoneValue) + out << ",\n" << indent << FormatKV(label, val); + }; + + report_if_present("total_allocated_bytes", + memory_result.total_allocated_bytes); + report_if_present("net_heap_growth", memory_result.net_heap_growth); } if (!run.report_label.empty()) { @@ -250,4 +314,7 @@ void JSONReporter::PrintRunData(Run const& run) { out << '\n'; } +const int64_t MemoryManager::TombstoneValue = + std::numeric_limits::max(); + } // end namespace benchmark diff --git a/ThirdParty/googlebenchmark/src/log.h b/ThirdParty/googlebenchmark/src/log.h index 47d0c35c01..9a21400b09 100644 --- a/ThirdParty/googlebenchmark/src/log.h +++ b/ThirdParty/googlebenchmark/src/log.h @@ -4,7 +4,12 @@ #include #include -#include "benchmark/benchmark.h" +// NOTE: this is also defined in benchmark.h but we're trying to avoid a +// dependency. 
+// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer. +#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L) +#define BENCHMARK_HAS_CXX11 +#endif namespace benchmark { namespace internal { @@ -23,7 +28,16 @@ class LogType { private: LogType(std::ostream* out) : out_(out) {} std::ostream* out_; - BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType); + + // NOTE: we could use BENCHMARK_DISALLOW_COPY_AND_ASSIGN but we shouldn't have + // a dependency on benchmark.h from here. +#ifndef BENCHMARK_HAS_CXX11 + LogType(const LogType&); + LogType& operator=(const LogType&); +#else + LogType(const LogType&) = delete; + LogType& operator=(const LogType&) = delete; +#endif }; template @@ -47,13 +61,13 @@ inline int& LogLevel() { } inline LogType& GetNullLogInstance() { - static LogType log(nullptr); - return log; + static LogType null_log(static_cast(nullptr)); + return null_log; } inline LogType& GetErrorLogInstance() { - static LogType log(&std::clog); - return log; + static LogType error_log(&std::clog); + return error_log; } inline LogType& GetLogInstanceForLevel(int level) { @@ -67,7 +81,7 @@ inline LogType& GetLogInstanceForLevel(int level) { } // end namespace benchmark // clang-format off -#define VLOG(x) \ +#define BM_VLOG(x) \ (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \ " ") // clang-format on diff --git a/ThirdParty/googlebenchmark/src/mutex.h b/ThirdParty/googlebenchmark/src/mutex.h index 5f461d05a0..bec78d9e5f 100644 --- a/ThirdParty/googlebenchmark/src/mutex.h +++ b/ThirdParty/googlebenchmark/src/mutex.h @@ -9,60 +9,60 @@ // Enable thread safety attributes only with clang. // The attributes can be safely erased when compiling with other compilers. 
#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES) -#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x)) +#define THREAD_ANNOTATION_ATTRIBUTE_(x) __attribute__((x)) #else -#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op +#define THREAD_ANNOTATION_ATTRIBUTE_(x) // no-op #endif -#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x)) +#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(capability(x)) -#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable) +#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE_(scoped_lockable) -#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x)) +#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(guarded_by(x)) -#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x)) +#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE_(pt_guarded_by(x)) #define ACQUIRED_BEFORE(...) \ - THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__)) + THREAD_ANNOTATION_ATTRIBUTE_(acquired_before(__VA_ARGS__)) #define ACQUIRED_AFTER(...) \ - THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__)) + THREAD_ANNOTATION_ATTRIBUTE_(acquired_after(__VA_ARGS__)) #define REQUIRES(...) \ - THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__)) + THREAD_ANNOTATION_ATTRIBUTE_(requires_capability(__VA_ARGS__)) #define REQUIRES_SHARED(...) \ - THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__)) + THREAD_ANNOTATION_ATTRIBUTE_(requires_shared_capability(__VA_ARGS__)) #define ACQUIRE(...) \ - THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__)) + THREAD_ANNOTATION_ATTRIBUTE_(acquire_capability(__VA_ARGS__)) #define ACQUIRE_SHARED(...) \ - THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__)) + THREAD_ANNOTATION_ATTRIBUTE_(acquire_shared_capability(__VA_ARGS__)) #define RELEASE(...) \ - THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__)) + THREAD_ANNOTATION_ATTRIBUTE_(release_capability(__VA_ARGS__)) #define RELEASE_SHARED(...) 
\ - THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__)) + THREAD_ANNOTATION_ATTRIBUTE_(release_shared_capability(__VA_ARGS__)) #define TRY_ACQUIRE(...) \ - THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__)) + THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_capability(__VA_ARGS__)) #define TRY_ACQUIRE_SHARED(...) \ - THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__)) + THREAD_ANNOTATION_ATTRIBUTE_(try_acquire_shared_capability(__VA_ARGS__)) -#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__)) +#define EXCLUDES(...) THREAD_ANNOTATION_ATTRIBUTE_(locks_excluded(__VA_ARGS__)) -#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x)) +#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(assert_capability(x)) #define ASSERT_SHARED_CAPABILITY(x) \ - THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x)) + THREAD_ANNOTATION_ATTRIBUTE_(assert_shared_capability(x)) -#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x)) +#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE_(lock_returned(x)) #define NO_THREAD_SAFETY_ANALYSIS \ - THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) + THREAD_ANNOTATION_ATTRIBUTE_(no_thread_safety_analysis) namespace benchmark { @@ -71,7 +71,7 @@ typedef std::condition_variable Condition; // NOTE: Wrappers for std::mutex and std::unique_lock are provided so that // we can annotate them with thread safety attributes and use the // -Wthread-safety warning with clang. The standard library types cannot be -// used directly because they do not provided the required annotations. +// used directly because they do not provide the required annotations. class CAPABILITY("mutex") Mutex { public: Mutex() {} @@ -130,7 +130,7 @@ class Barrier { // entered the barrier. Returns iff this is the last thread to // enter the barrier. 
bool createBarrier(MutexLock& ml) REQUIRES(lock_) { - CHECK_LT(entered_, running_threads_); + BM_CHECK_LT(entered_, running_threads_); entered_++; if (entered_ < running_threads_) { // Wait for all threads to enter diff --git a/ThirdParty/googlebenchmark/src/perf_counters.cc b/ThirdParty/googlebenchmark/src/perf_counters.cc new file mode 100644 index 0000000000..417acdb18f --- /dev/null +++ b/ThirdParty/googlebenchmark/src/perf_counters.cc @@ -0,0 +1,282 @@ +// Copyright 2021 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "perf_counters.h" + +#include +#include +#include + +#if defined HAVE_LIBPFM +#include "perfmon/pfmlib.h" +#include "perfmon/pfmlib_perf_event.h" +#endif + +namespace benchmark { +namespace internal { + +constexpr size_t PerfCounterValues::kMaxCounters; + +#if defined HAVE_LIBPFM + +size_t PerfCounterValues::Read(const std::vector& leaders) { + // Create a pointer for multiple reads + const size_t bufsize = values_.size() * sizeof(values_[0]); + char* ptr = reinterpret_cast(values_.data()); + size_t size = bufsize; + for (int lead : leaders) { + auto read_bytes = ::read(lead, ptr, size); + if (read_bytes >= ssize_t(sizeof(uint64_t))) { + // Actual data bytes are all bytes minus initial padding + std::size_t data_bytes = read_bytes - sizeof(uint64_t); + // This should be very cheap since it's in hot cache + std::memmove(ptr, ptr + sizeof(uint64_t), data_bytes); + // Increment our counters + ptr += data_bytes; + size -= data_bytes; + } else { + int err = errno; + GetErrorLogInstance() << "Error reading lead " << lead << " errno:" << err + << " " << ::strerror(err) << "\n"; + return 0; + } + } + return (bufsize - size) / sizeof(uint64_t); +} + +const bool PerfCounters::kSupported = true; + +// Initializes libpfm only on the first call. Returns whether that single +// initialization was successful. +bool PerfCounters::Initialize() { + // Function-scope static gets initialized only once on first call. 
+ static const bool success = []() { + return pfm_initialize() == PFM_SUCCESS; + }(); + return success; +} + +bool PerfCounters::IsCounterSupported(const std::string& name) { + Initialize(); + perf_event_attr_t attr; + std::memset(&attr, 0, sizeof(attr)); + pfm_perf_encode_arg_t arg; + std::memset(&arg, 0, sizeof(arg)); + arg.attr = &attr; + const int mode = PFM_PLM3; // user mode only + int ret = pfm_get_os_event_encoding(name.c_str(), mode, PFM_OS_PERF_EVENT_EXT, + &arg); + return (ret == PFM_SUCCESS); +} + +PerfCounters PerfCounters::Create( + const std::vector& counter_names) { + if (!counter_names.empty()) { + Initialize(); + } + + // Valid counters will populate these arrays but we start empty + std::vector valid_names; + std::vector counter_ids; + std::vector leader_ids; + + // Resize to the maximum possible + valid_names.reserve(counter_names.size()); + counter_ids.reserve(counter_names.size()); + + const int kCounterMode = PFM_PLM3; // user mode only + + // Group leads will be assigned on demand. The idea is that once we cannot + // create a counter descriptor, the reason is that this group has maxed out + // so we set the group_id again to -1 and retry - giving the algorithm a + // chance to create a new group leader to hold the next set of counters. + int group_id = -1; + + // Loop through all performance counters + for (size_t i = 0; i < counter_names.size(); ++i) { + // we are about to push into the valid names vector + // check if we did not reach the maximum + if (valid_names.size() == PerfCounterValues::kMaxCounters) { + // Log a message if we maxed out and stop adding + GetErrorLogInstance() + << counter_names.size() << " counters were requested. The maximum is " + << PerfCounterValues::kMaxCounters << " and " << valid_names.size() + << " were already added. 
All remaining counters will be ignored\n"; + // stop the loop and return what we have already + break; + } + + // Check if this name is empty + const auto& name = counter_names[i]; + if (name.empty()) { + GetErrorLogInstance() + << "A performance counter name was the empty string\n"; + continue; + } + + // Here first means first in group, ie the group leader + const bool is_first = (group_id < 0); + + // This struct will be populated by libpfm from the counter string + // and then fed into the syscall perf_event_open + struct perf_event_attr attr {}; + attr.size = sizeof(attr); + + // This is the input struct to libpfm. + pfm_perf_encode_arg_t arg{}; + arg.attr = &attr; + const int pfm_get = pfm_get_os_event_encoding(name.c_str(), kCounterMode, + PFM_OS_PERF_EVENT, &arg); + if (pfm_get != PFM_SUCCESS) { + GetErrorLogInstance() + << "Unknown performance counter name: " << name << "\n"; + continue; + } + + // We then proceed to populate the remaining fields in our attribute struct + // Note: the man page for perf_event_create suggests inherit = true and + // read_format = PERF_FORMAT_GROUP don't work together, but that's not the + // case. + attr.disabled = is_first; + attr.inherit = true; + attr.pinned = is_first; + attr.exclude_kernel = true; + attr.exclude_user = false; + attr.exclude_hv = true; + + // Read all counters in a group in one read. + attr.read_format = PERF_FORMAT_GROUP; + + int id = -1; + while (id < 0) { + static constexpr size_t kNrOfSyscallRetries = 5; + // Retry syscall as it was interrupted often (b/64774091). + for (size_t num_retries = 0; num_retries < kNrOfSyscallRetries; + ++num_retries) { + id = perf_event_open(&attr, 0, -1, group_id, 0); + if (id >= 0 || errno != EINTR) { + break; + } + } + if (id < 0) { + // If the file descriptor is negative we might have reached a limit + // in the current group. 
Set the group_id to -1 and retry + if (group_id >= 0) { + // Create a new group + group_id = -1; + } else { + // At this point we have already retried to set a new group id and + // failed. We then give up. + break; + } + } + } + + // We failed to get a new file descriptor. We might have reached a hard + // hardware limit that cannot be resolved even with group multiplexing + if (id < 0) { + GetErrorLogInstance() << "***WARNING** Failed to get a file descriptor " + "for performance counter " + << name << ". Ignoring\n"; + + // We give up on this counter but try to keep going + // as the others would be fine + continue; + } + if (group_id < 0) { + // This is a leader, store and assign it to the current file descriptor + leader_ids.push_back(id); + group_id = id; + } + // This is a valid counter, add it to our descriptor's list + counter_ids.push_back(id); + valid_names.push_back(name); + } + + // Loop through all group leaders activating them + // There is another option of starting ALL counters in a process but + // that would be far reaching an intrusion. If the user is using PMCs + // by themselves then this would have a side effect on them. It is + // friendlier to loop through all groups individually. + for (int lead : leader_ids) { + if (ioctl(lead, PERF_EVENT_IOC_ENABLE) != 0) { + // This should never happen but if it does, we give up on the + // entire batch as recovery would be a mess. + GetErrorLogInstance() << "***WARNING*** Failed to start counters. 
" + "Claring out all counters.\n"; + + // Close all peformance counters + for (int id : counter_ids) { + ::close(id); + } + + // Return an empty object so our internal state is still good and + // the process can continue normally without impact + return NoCounters(); + } + } + + return PerfCounters(std::move(valid_names), std::move(counter_ids), + std::move(leader_ids)); +} + +void PerfCounters::CloseCounters() const { + if (counter_ids_.empty()) { + return; + } + for (int lead : leader_ids_) { + ioctl(lead, PERF_EVENT_IOC_DISABLE); + } + for (int fd : counter_ids_) { + close(fd); + } +} +#else // defined HAVE_LIBPFM +size_t PerfCounterValues::Read(const std::vector&) { return 0; } + +const bool PerfCounters::kSupported = false; + +bool PerfCounters::Initialize() { return false; } + +bool PerfCounters::IsCounterSupported(const std::string&) { return false; } + +PerfCounters PerfCounters::Create( + const std::vector& counter_names) { + if (!counter_names.empty()) { + GetErrorLogInstance() << "Performance counters not supported."; + } + return NoCounters(); +} + +void PerfCounters::CloseCounters() const {} +#endif // defined HAVE_LIBPFM + +PerfCountersMeasurement::PerfCountersMeasurement( + const std::vector& counter_names) + : start_values_(counter_names.size()), end_values_(counter_names.size()) { + counters_ = PerfCounters::Create(counter_names); +} + +PerfCounters& PerfCounters::operator=(PerfCounters&& other) noexcept { + if (this != &other) { + CloseCounters(); + + counter_ids_ = std::move(other.counter_ids_); + leader_ids_ = std::move(other.leader_ids_); + counter_names_ = std::move(other.counter_names_); + } + return *this; +} +} // namespace internal +} // namespace benchmark diff --git a/ThirdParty/googlebenchmark/src/perf_counters.h b/ThirdParty/googlebenchmark/src/perf_counters.h new file mode 100644 index 0000000000..bf5eb6bc3a --- /dev/null +++ b/ThirdParty/googlebenchmark/src/perf_counters.h @@ -0,0 +1,200 @@ +// Copyright 2021 Google Inc. 
All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef BENCHMARK_PERF_COUNTERS_H +#define BENCHMARK_PERF_COUNTERS_H + +#include +#include +#include +#include +#include + +#include "benchmark/benchmark.h" +#include "check.h" +#include "log.h" +#include "mutex.h" + +#ifndef BENCHMARK_OS_WINDOWS +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(push) +// C4251: needs to have dll-interface to be used by clients of class +#pragma warning(disable : 4251) +#endif + +namespace benchmark { +namespace internal { + +// Typically, we can only read a small number of counters. There is also a +// padding preceding counter values, when reading multiple counters with one +// syscall (which is desirable). PerfCounterValues abstracts these details. +// The implementation ensures the storage is inlined, and allows 0-based +// indexing into the counter values. +// The object is used in conjunction with a PerfCounters object, by passing it +// to Snapshot(). The Read() method relocates individual reads, discarding +// the initial padding from each group leader in the values buffer such that +// all user accesses through the [] operator are correct. 
+class BENCHMARK_EXPORT PerfCounterValues { + public: + explicit PerfCounterValues(size_t nr_counters) : nr_counters_(nr_counters) { + BM_CHECK_LE(nr_counters_, kMaxCounters); + } + + // We are reading correctly now so the values don't need to skip padding + uint64_t operator[](size_t pos) const { return values_[pos]; } + + // Increased the maximum to 32 only since the buffer + // is std::array<> backed + static constexpr size_t kMaxCounters = 32; + + private: + friend class PerfCounters; + // Get the byte buffer in which perf counters can be captured. + // This is used by PerfCounters::Read + std::pair get_data_buffer() { + return {reinterpret_cast(values_.data()), + sizeof(uint64_t) * (kPadding + nr_counters_)}; + } + + // This reading is complex and as the goal of this class is to + // abstract away the intrincacies of the reading process, this is + // a better place for it + size_t Read(const std::vector& leaders); + + // Move the padding to 2 due to the reading algorithm (1st padding plus a + // current read padding) + static constexpr size_t kPadding = 2; + std::array values_; + const size_t nr_counters_; +}; + +// Collect PMU counters. The object, once constructed, is ready to be used by +// calling read(). PMU counter collection is enabled from the time create() is +// called, to obtain the object, until the object's destructor is called. +class BENCHMARK_EXPORT PerfCounters final { + public: + // True iff this platform supports performance counters. + static const bool kSupported; + + // Returns an empty object + static PerfCounters NoCounters() { return PerfCounters(); } + + ~PerfCounters() { CloseCounters(); } + PerfCounters() = default; + PerfCounters(PerfCounters&&) = default; + PerfCounters(const PerfCounters&) = delete; + PerfCounters& operator=(PerfCounters&&) noexcept; + PerfCounters& operator=(const PerfCounters&) = delete; + + // Platform-specific implementations may choose to do some library + // initialization here. 
+ static bool Initialize(); + + // Check if the given counter is supported, if the app wants to + // check before passing + static bool IsCounterSupported(const std::string& name); + + // Return a PerfCounters object ready to read the counters with the names + // specified. The values are user-mode only. The counter name format is + // implementation and OS specific. + // In case of failure, this method will in the worst case return an + // empty object whose state will still be valid. + static PerfCounters Create(const std::vector& counter_names); + + // Take a snapshot of the current value of the counters into the provided + // valid PerfCounterValues storage. The values are populated such that: + // names()[i]'s value is (*values)[i] + BENCHMARK_ALWAYS_INLINE bool Snapshot(PerfCounterValues* values) const { +#ifndef BENCHMARK_OS_WINDOWS + assert(values != nullptr); + return values->Read(leader_ids_) == counter_ids_.size(); +#else + (void)values; + return false; +#endif + } + + const std::vector& names() const { return counter_names_; } + size_t num_counters() const { return counter_names_.size(); } + + private: + PerfCounters(const std::vector& counter_names, + std::vector&& counter_ids, std::vector&& leader_ids) + : counter_ids_(std::move(counter_ids)), + leader_ids_(std::move(leader_ids)), + counter_names_(counter_names) {} + + void CloseCounters() const; + + std::vector counter_ids_; + std::vector leader_ids_; + std::vector counter_names_; +}; + +// Typical usage of the above primitives. +class BENCHMARK_EXPORT PerfCountersMeasurement final { + public: + PerfCountersMeasurement(const std::vector& counter_names); + + size_t num_counters() const { return counters_.num_counters(); } + + std::vector names() const { return counters_.names(); } + + BENCHMARK_ALWAYS_INLINE bool Start() { + if (num_counters() == 0) return true; + // Tell the compiler to not move instructions above/below where we take + // the snapshot. 
+ ClobberMemory(); + valid_read_ &= counters_.Snapshot(&start_values_); + ClobberMemory(); + + return valid_read_; + } + + BENCHMARK_ALWAYS_INLINE bool Stop( + std::vector>& measurements) { + if (num_counters() == 0) return true; + // Tell the compiler to not move instructions above/below where we take + // the snapshot. + ClobberMemory(); + valid_read_ &= counters_.Snapshot(&end_values_); + ClobberMemory(); + + for (size_t i = 0; i < counters_.names().size(); ++i) { + double measurement = static_cast(end_values_[i]) - + static_cast(start_values_[i]); + measurements.push_back({counters_.names()[i], measurement}); + } + + return valid_read_; + } + + private: + PerfCounters counters_; + bool valid_read_ = true; + PerfCounterValues start_values_; + PerfCounterValues end_values_; +}; + +} // namespace internal +} // namespace benchmark + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +#endif // BENCHMARK_PERF_COUNTERS_H diff --git a/ThirdParty/googlebenchmark/src/re.h b/ThirdParty/googlebenchmark/src/re.h index fbe25037b4..9afb869bea 100644 --- a/ThirdParty/googlebenchmark/src/re.h +++ b/ThirdParty/googlebenchmark/src/re.h @@ -33,7 +33,7 @@ // Prefer C regex libraries when compiling w/o exceptions so that we can // correctly report errors. #if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && \ - defined(BENCHMARK_HAVE_STD_REGEX) && \ + defined(HAVE_STD_REGEX) && \ (defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX)) #undef HAVE_STD_REGEX #endif @@ -126,7 +126,7 @@ inline bool Regex::Init(const std::string& spec, std::string* error) { // regerror returns the number of bytes necessary to null terminate // the string, so we move that when assigning to error. 
- CHECK_NE(needed, 0); + BM_CHECK_NE(needed, 0); error->assign(errbuf, needed - 1); delete[] errbuf; diff --git a/ThirdParty/googlebenchmark/src/reporter.cc b/ThirdParty/googlebenchmark/src/reporter.cc index 4d3e477d44..076bc31a2e 100644 --- a/ThirdParty/googlebenchmark/src/reporter.cc +++ b/ThirdParty/googlebenchmark/src/reporter.cc @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "benchmark/benchmark.h" -#include "timers.h" - #include - #include +#include +#include #include #include +#include "benchmark/benchmark.h" #include "check.h" #include "string_util.h" +#include "timers.h" namespace benchmark { @@ -33,10 +33,14 @@ BenchmarkReporter::~BenchmarkReporter() {} void BenchmarkReporter::PrintBasicContext(std::ostream *out, Context const &context) { - CHECK(out) << "cannot be null"; + BM_CHECK(out) << "cannot be null"; auto &Out = *out; +#ifndef BENCHMARK_OS_QURT + // Date/time information is not available on QuRT. + // Attempting to get it via this call cause the binary to crash. 
Out << LocalDateTimeString() << "\n"; +#endif if (context.executable_name) Out << "Running " << context.executable_name << "\n"; @@ -49,7 +53,7 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out, Out << "CPU Caches:\n"; for (auto &CInfo : info.caches) { Out << " L" << CInfo.level << " " << CInfo.type << " " - << (CInfo.size / 1000) << "K"; + << (CInfo.size / 1024) << " KiB"; if (CInfo.num_sharing != 0) Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")"; Out << "\n"; @@ -64,7 +68,16 @@ void BenchmarkReporter::PrintBasicContext(std::ostream *out, Out << "\n"; } - if (info.scaling_enabled) { + std::map *global_context = + internal::GetGlobalContext(); + + if (global_context != nullptr) { + for (const auto &kv : *global_context) { + Out << kv.first << ": " << kv.second << "\n"; + } + } + + if (CPUInfo::Scaling::ENABLED == info.scaling) { Out << "***WARNING*** CPU scaling is enabled, the benchmark " "real time measurements may be noisy and will incur extra " "overhead.\n"; diff --git a/ThirdParty/googlebenchmark/src/sleep.cc b/ThirdParty/googlebenchmark/src/sleep.cc deleted file mode 100644 index 1512ac90f7..0000000000 --- a/ThirdParty/googlebenchmark/src/sleep.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2015 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "sleep.h" - -#include -#include -#include - -#include "internal_macros.h" - -#ifdef BENCHMARK_OS_WINDOWS -#include -#endif - -namespace benchmark { -#ifdef BENCHMARK_OS_WINDOWS -// Window's Sleep takes milliseconds argument. -void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); } -void SleepForSeconds(double seconds) { - SleepForMilliseconds(static_cast(kNumMillisPerSecond * seconds)); -} -#else // BENCHMARK_OS_WINDOWS -void SleepForMicroseconds(int microseconds) { - struct timespec sleep_time; - sleep_time.tv_sec = microseconds / kNumMicrosPerSecond; - sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro; - while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) - ; // Ignore signals and wait for the full interval to elapse. -} - -void SleepForMilliseconds(int milliseconds) { - SleepForMicroseconds(milliseconds * kNumMicrosPerMilli); -} - -void SleepForSeconds(double seconds) { - SleepForMicroseconds(static_cast(seconds * kNumMicrosPerSecond)); -} -#endif // BENCHMARK_OS_WINDOWS -} // end namespace benchmark diff --git a/ThirdParty/googlebenchmark/src/sleep.h b/ThirdParty/googlebenchmark/src/sleep.h deleted file mode 100644 index f98551afe2..0000000000 --- a/ThirdParty/googlebenchmark/src/sleep.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef BENCHMARK_SLEEP_H_ -#define BENCHMARK_SLEEP_H_ - -namespace benchmark { -const int kNumMillisPerSecond = 1000; -const int kNumMicrosPerMilli = 1000; -const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000; -const int kNumNanosPerMicro = 1000; -const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond; - -void SleepForMilliseconds(int milliseconds); -void SleepForSeconds(double seconds); -} // end namespace benchmark - -#endif // BENCHMARK_SLEEP_H_ diff --git a/ThirdParty/googlebenchmark/src/statistics.cc b/ThirdParty/googlebenchmark/src/statistics.cc index bd5a3d6597..844e926895 100644 --- a/ThirdParty/googlebenchmark/src/statistics.cc +++ 
b/ThirdParty/googlebenchmark/src/statistics.cc @@ -13,15 +13,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "benchmark/benchmark.h" +#include "statistics.h" #include #include #include #include #include + +#include "benchmark/benchmark.h" #include "check.h" -#include "statistics.h" namespace benchmark { @@ -41,13 +42,13 @@ double StatisticsMedian(const std::vector& v) { auto center = copy.begin() + v.size() / 2; std::nth_element(copy.begin(), center, copy.end()); - // did we have an odd number of samples? - // if yes, then center is the median - // it no, then we are looking for the average between center and the value - // before + // Did we have an odd number of samples? If yes, then center is the median. + // If not, then we are looking for the average between center and the value + // before. Instead of resorting, we just look for the max value before it, + // which is not necessarily the element immediately preceding `center` Since + // `copy` is only partially sorted by `nth_element`. 
if (v.size() % 2 == 1) return *center; - auto center2 = copy.begin() + v.size() / 2 - 1; - std::nth_element(copy.begin(), center2, copy.end()); + auto center2 = std::max_element(copy.begin(), center); return (*center + *center2) / 2.0; } @@ -74,14 +75,22 @@ double StatisticsStdDev(const std::vector& v) { return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean))); } +double StatisticsCV(const std::vector& v) { + if (v.size() < 2) return 0.0; + + const auto stddev = StatisticsStdDev(v); + const auto mean = StatisticsMean(v); + + return stddev / mean; +} + std::vector ComputeStats( const std::vector& reports) { typedef BenchmarkReporter::Run Run; std::vector results; - auto error_count = - std::count_if(reports.begin(), reports.end(), - [](Run const& run) { return run.error_occurred; }); + auto error_count = std::count_if(reports.begin(), reports.end(), + [](Run const& run) { return run.skipped; }); if (reports.size() - error_count < 2) { // We don't report aggregated data if there was a single run. @@ -108,26 +117,28 @@ std::vector ComputeStats( for (auto const& cnt : r.counters) { auto it = counter_stats.find(cnt.first); if (it == counter_stats.end()) { - counter_stats.insert({cnt.first, {cnt.second, std::vector{}}}); - it = counter_stats.find(cnt.first); + it = counter_stats + .emplace(cnt.first, + CounterStat{cnt.second, std::vector{}}) + .first; it->second.s.reserve(reports.size()); } else { - CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags); + BM_CHECK_EQ(it->second.c.flags, cnt.second.flags); } } } // Populate the accumulators. 
for (Run const& run : reports) { - CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name()); - CHECK_EQ(run_iterations, run.iterations); - if (run.error_occurred) continue; + BM_CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name()); + BM_CHECK_EQ(run_iterations, run.iterations); + if (run.skipped) continue; real_accumulated_time_stat.emplace_back(run.real_accumulated_time); cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time); // user counters for (auto const& cnt : run.counters) { auto it = counter_stats.find(cnt.first); - CHECK_NE(it, counter_stats.end()); + BM_CHECK_NE(it, counter_stats.end()); it->second.s.emplace_back(cnt.second); } } @@ -148,11 +159,14 @@ std::vector ComputeStats( // Get the data from the accumulator to BenchmarkReporter::Run's. Run data; data.run_name = reports[0].run_name; + data.family_index = reports[0].family_index; + data.per_family_instance_index = reports[0].per_family_instance_index; data.run_type = BenchmarkReporter::Run::RT_Aggregate; data.threads = reports[0].threads; data.repetitions = reports[0].repetitions; data.repetition_index = Run::no_repetition_index; data.aggregate_name = Stat.name_; + data.aggregate_unit = Stat.unit_; data.report_label = report_label; // It is incorrect to say that an aggregate is computed over @@ -165,13 +179,15 @@ std::vector ComputeStats( data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat); data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat); - // We will divide these times by data.iterations when reporting, but the - // data.iterations is not nessesairly the scale of these measurements, - // because in each repetition, these timers are sum over all the iterations. - // And if we want to say that the stats are over N repetitions and not - // M iterations, we need to multiply these by (N/M). 
- data.real_accumulated_time *= iteration_rescale_factor; - data.cpu_accumulated_time *= iteration_rescale_factor; + if (data.aggregate_unit == StatisticUnit::kTime) { + // We will divide these times by data.iterations when reporting, but the + // data.iterations is not necessarily the scale of these measurements, + // because in each repetition, these timers are sum over all the iters. + // And if we want to say that the stats are over N repetitions and not + // M iterations, we need to multiply these by (N/M). + data.real_accumulated_time *= iteration_rescale_factor; + data.cpu_accumulated_time *= iteration_rescale_factor; + } data.time_unit = reports[0].time_unit; diff --git a/ThirdParty/googlebenchmark/src/statistics.h b/ThirdParty/googlebenchmark/src/statistics.h index 7eccc85536..6e5560e8f1 100644 --- a/ThirdParty/googlebenchmark/src/statistics.h +++ b/ThirdParty/googlebenchmark/src/statistics.h @@ -22,15 +22,22 @@ namespace benchmark { -// Return a vector containing the mean, median and standard devation information -// (and any user-specified info) for the specified list of reports. If 'reports' -// contains less than two non-errored runs an empty vector is returned +// Return a vector containing the mean, median and standard deviation +// information (and any user-specified info) for the specified list of reports. 
+// If 'reports' contains less than two non-errored runs an empty vector is +// returned +BENCHMARK_EXPORT std::vector ComputeStats( const std::vector& reports); +BENCHMARK_EXPORT double StatisticsMean(const std::vector& v); +BENCHMARK_EXPORT double StatisticsMedian(const std::vector& v); +BENCHMARK_EXPORT double StatisticsStdDev(const std::vector& v); +BENCHMARK_EXPORT +double StatisticsCV(const std::vector& v); } // end namespace benchmark diff --git a/ThirdParty/googlebenchmark/src/string_util.cc b/ThirdParty/googlebenchmark/src/string_util.cc index 39b01a1719..c69e40a813 100644 --- a/ThirdParty/googlebenchmark/src/string_util.cc +++ b/ThirdParty/googlebenchmark/src/string_util.cc @@ -1,6 +1,9 @@ #include "string_util.h" #include +#ifdef BENCHMARK_STL_ANDROID_GNUSTL +#include +#endif #include #include #include @@ -8,16 +11,17 @@ #include #include "arraysize.h" +#include "benchmark/benchmark.h" namespace benchmark { namespace { - // kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta. -const char kBigSIUnits[] = "kMGTPEZY"; +const char* const kBigSIUnits[] = {"k", "M", "G", "T", "P", "E", "Z", "Y"}; // Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi. -const char kBigIECUnits[] = "KMGTPEZY"; +const char* const kBigIECUnits[] = {"Ki", "Mi", "Gi", "Ti", + "Pi", "Ei", "Zi", "Yi"}; // milli, micro, nano, pico, femto, atto, zepto, yocto. -const char kSmallSIUnits[] = "munpfazy"; +const char* const kSmallSIUnits[] = {"m", "u", "n", "p", "f", "a", "z", "y"}; // We require that all three arrays have the same size. 
static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), @@ -27,9 +31,8 @@ static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), static const int64_t kUnitsSize = arraysize(kBigSIUnits); -void ToExponentAndMantissa(double val, double thresh, int precision, - double one_k, std::string* mantissa, - int64_t* exponent) { +void ToExponentAndMantissa(double val, int precision, double one_k, + std::string* mantissa, int64_t* exponent) { std::stringstream mantissa_stream; if (val < 0) { @@ -40,8 +43,8 @@ void ToExponentAndMantissa(double val, double thresh, int precision, // Adjust threshold so that it never excludes things which can't be rendered // in 'precision' digits. const double adjusted_threshold = - std::max(thresh, 1.0 / std::pow(10.0, precision)); - const double big_threshold = adjusted_threshold * one_k; + std::max(1.0, 1.0 / std::pow(10.0, precision)); + const double big_threshold = (adjusted_threshold * one_k) - 1; const double small_threshold = adjusted_threshold; // Values in ]simple_threshold,small_threshold[ will be printed as-is const double simple_threshold = 0.01; @@ -89,37 +92,20 @@ std::string ExponentToPrefix(int64_t exponent, bool iec) { const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1); if (index >= kUnitsSize) return ""; - const char* array = + const char* const* array = (exponent > 0 ? (iec ? kBigIECUnits : kBigSIUnits) : kSmallSIUnits); - if (iec) - return array[index] + std::string("i"); - else - return std::string(1, array[index]); + + return std::string(array[index]); } -std::string ToBinaryStringFullySpecified(double value, double threshold, - int precision, double one_k = 1024.0) { +std::string ToBinaryStringFullySpecified(double value, int precision, + Counter::OneK one_k) { std::string mantissa; int64_t exponent; - ToExponentAndMantissa(value, threshold, precision, one_k, &mantissa, + ToExponentAndMantissa(value, precision, + one_k == Counter::kIs1024 ? 
1024.0 : 1000.0, &mantissa, &exponent); - return mantissa + ExponentToPrefix(exponent, false); -} - -} // end namespace - -void AppendHumanReadable(int n, std::string* str) { - std::stringstream ss; - // Round down to the nearest SI prefix. - ss << ToBinaryStringFullySpecified(n, 1.0, 0); - *str += ss.str(); -} - -std::string HumanReadableNumber(double n, double one_k) { - // 1.1 means that figures up to 1.1k should be shown with the next unit down; - // this softens edge effects. - // 1 means that we should show one decimal place of precision. - return ToBinaryStringFullySpecified(n, 1.1, 1, one_k); + return mantissa + ExponentToPrefix(exponent, one_k == Counter::kIs1024); } std::string StrFormatImp(const char* msg, va_list args) { @@ -130,28 +116,34 @@ std::string StrFormatImp(const char* msg, va_list args) { // TODO(ericwf): use std::array for first attempt to avoid one memory // allocation guess what the size might be std::array local_buff; - std::size_t size = local_buff.size(); + // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation // in the android-ndk - auto ret = vsnprintf(local_buff.data(), size, msg, args_cp); + auto ret = vsnprintf(local_buff.data(), local_buff.size(), msg, args_cp); va_end(args_cp); // handle empty expansion if (ret == 0) return std::string{}; - if (static_cast(ret) < size) + if (static_cast(ret) < local_buff.size()) return std::string(local_buff.data()); // we did not provide a long enough buffer on our first attempt. 
// add 1 to size to account for null-byte in size cast to prevent overflow - size = static_cast(ret) + 1; + std::size_t size = static_cast(ret) + 1; auto buff_ptr = std::unique_ptr(new char[size]); // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation // in the android-ndk - ret = vsnprintf(buff_ptr.get(), size, msg, args); + vsnprintf(buff_ptr.get(), size, msg, args); return std::string(buff_ptr.get()); } +} // end namespace + +std::string HumanReadableNumber(double n, Counter::OneK one_k) { + return ToBinaryStringFullySpecified(n, 1, one_k); +} + std::string StrFormat(const char* format, ...) { va_list args; va_start(args, format); @@ -160,6 +152,19 @@ std::string StrFormat(const char* format, ...) { return tmp; } +std::vector StrSplit(const std::string& str, char delim) { + if (str.empty()) return {}; + std::vector ret; + size_t first = 0; + size_t next = str.find(delim); + for (; next != std::string::npos; + first = next + 1, next = str.find(delim, first)) { + ret.push_back(str.substr(first, next - first)); + } + ret.push_back(str.substr(first)); + return ret; +} + #ifdef BENCHMARK_STL_ANDROID_GNUSTL /* * GNU STL in Android NDK lacks support for some C++11 functions, including @@ -182,11 +187,10 @@ unsigned long stoul(const std::string& str, size_t* pos, int base) { /* Check for errors and return */ if (strtoulErrno == ERANGE) { - throw std::out_of_range( - "stoul failed: " + str + " is outside of range of unsigned long"); + throw std::out_of_range("stoul failed: " + str + + " is outside of range of unsigned long"); } else if (strEnd == strStart || strtoulErrno != 0) { - throw std::invalid_argument( - "stoul failed: " + str + " is not an integer"); + throw std::invalid_argument("stoul failed: " + str + " is not an integer"); } if (pos != nullptr) { *pos = static_cast(strEnd - strStart); @@ -209,11 +213,10 @@ int stoi(const std::string& str, size_t* pos, int base) { /* Check for errors and return */ if (strtolErrno == ERANGE || 
long(int(result)) != result) { - throw std::out_of_range( - "stoul failed: " + str + " is outside of range of int"); + throw std::out_of_range("stoul failed: " + str + + " is outside of range of int"); } else if (strEnd == strStart || strtolErrno != 0) { - throw std::invalid_argument( - "stoul failed: " + str + " is not an integer"); + throw std::invalid_argument("stoul failed: " + str + " is not an integer"); } if (pos != nullptr) { *pos = static_cast(strEnd - strStart); @@ -236,11 +239,10 @@ double stod(const std::string& str, size_t* pos) { /* Check for errors and return */ if (strtodErrno == ERANGE) { - throw std::out_of_range( - "stoul failed: " + str + " is outside of range of int"); + throw std::out_of_range("stoul failed: " + str + + " is outside of range of int"); } else if (strEnd == strStart || strtodErrno != 0) { - throw std::invalid_argument( - "stoul failed: " + str + " is not an integer"); + throw std::invalid_argument("stoul failed: " + str + " is not an integer"); } if (pos != nullptr) { *pos = static_cast(strEnd - strStart); diff --git a/ThirdParty/googlebenchmark/src/string_util.h b/ThirdParty/googlebenchmark/src/string_util.h index 09d7b4bd2a..731aa2c04c 100644 --- a/ThirdParty/googlebenchmark/src/string_util.h +++ b/ThirdParty/googlebenchmark/src/string_util.h @@ -4,14 +4,19 @@ #include #include #include +#include + +#include "benchmark/benchmark.h" +#include "benchmark/export.h" +#include "check.h" #include "internal_macros.h" namespace benchmark { -void AppendHumanReadable(int n, std::string* str); - -std::string HumanReadableNumber(double n, double one_k = 1024.0); +BENCHMARK_EXPORT +std::string HumanReadableNumber(double n, Counter::OneK one_k); +BENCHMARK_EXPORT #if defined(__MINGW32__) __attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2))) #elif defined(__GNUC__) @@ -37,6 +42,11 @@ inline std::string StrCat(Args&&... 
args) { return ss.str(); } +BENCHMARK_EXPORT +std::vector StrSplit(const std::string& str, char delim); + +// Disable lint checking for this block since it re-implements C functions. +// NOLINTBEGIN #ifdef BENCHMARK_STL_ANDROID_GNUSTL /* * GNU STL in Android NDK lacks support for some C++11 functions, including @@ -45,14 +55,15 @@ inline std::string StrCat(Args&&... args) { * namespace, not std:: namespace. */ unsigned long stoul(const std::string& str, size_t* pos = nullptr, - int base = 10); + int base = 10); int stoi(const std::string& str, size_t* pos = nullptr, int base = 10); double stod(const std::string& str, size_t* pos = nullptr); #else -using std::stoul; -using std::stoi; -using std::stod; +using std::stod; // NOLINT(misc-unused-using-decls) +using std::stoi; // NOLINT(misc-unused-using-decls) +using std::stoul; // NOLINT(misc-unused-using-decls) #endif +// NOLINTEND } // end namespace benchmark diff --git a/ThirdParty/googlebenchmark/src/sysinfo.cc b/ThirdParty/googlebenchmark/src/sysinfo.cc index 28126470ba..922e83ac92 100644 --- a/ThirdParty/googlebenchmark/src/sysinfo.cc +++ b/ThirdParty/googlebenchmark/src/sysinfo.cc @@ -19,27 +19,36 @@ #undef StrCat // Don't let StrCat in string_util.h be renamed to lstrcatA #include #include + #include #else #include -#ifndef BENCHMARK_OS_FUCHSIA +#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT) #include #endif #include #include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD #include #if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || \ - defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD + defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD || \ + defined BENCHMARK_OS_DRAGONFLY #define BENCHMARK_HAS_SYSCTL #include #endif #endif #if defined(BENCHMARK_OS_SOLARIS) #include +#include #endif #if defined(BENCHMARK_OS_QNX) #include #endif +#if defined(BENCHMARK_OS_QURT) +#include +#endif +#if 
defined(BENCHMARK_HAS_PTHREAD_AFFINITY) +#include +#endif #include #include @@ -54,16 +63,19 @@ #include #include #include +#include #include +#include #include -#include +#include +#include "benchmark/benchmark.h" #include "check.h" #include "cycleclock.h" #include "internal_macros.h" #include "log.h" -#include "sleep.h" #include "string_util.h" +#include "timers.h" namespace benchmark { namespace { @@ -88,67 +100,59 @@ BENCHMARK_NORETURN void PrintErrorAndDie(Args&&... args) { /// `sysctl` with the result type it's to be interpreted as. struct ValueUnion { union DataT { - uint32_t uint32_value; - uint64_t uint64_value; + int32_t int32_value; + int64_t int64_value; // For correct aliasing of union members from bytes. char bytes[8]; }; using DataPtr = std::unique_ptr; // The size of the data union member + its trailing array size. - size_t Size; - DataPtr Buff; + std::size_t size; + DataPtr buff; public: - ValueUnion() : Size(0), Buff(nullptr, &std::free) {} + ValueUnion() : size(0), buff(nullptr, &std::free) {} - explicit ValueUnion(size_t BuffSize) - : Size(sizeof(DataT) + BuffSize), - Buff(::new (std::malloc(Size)) DataT(), &std::free) {} + explicit ValueUnion(std::size_t buff_size) + : size(sizeof(DataT) + buff_size), + buff(::new (std::malloc(size)) DataT(), &std::free) {} ValueUnion(ValueUnion&& other) = default; - explicit operator bool() const { return bool(Buff); } + explicit operator bool() const { return bool(buff); } - char* data() const { return Buff->bytes; } + char* data() const { return buff->bytes; } std::string GetAsString() const { return std::string(data()); } int64_t GetAsInteger() const { - if (Size == sizeof(Buff->uint32_value)) - return static_cast(Buff->uint32_value); - else if (Size == sizeof(Buff->uint64_value)) - return static_cast(Buff->uint64_value); - BENCHMARK_UNREACHABLE(); - } - - uint64_t GetAsUnsigned() const { - if (Size == sizeof(Buff->uint32_value)) - return Buff->uint32_value; - else if (Size == sizeof(Buff->uint64_value)) - 
return Buff->uint64_value; + if (size == sizeof(buff->int32_value)) + return buff->int32_value; + else if (size == sizeof(buff->int64_value)) + return buff->int64_value; BENCHMARK_UNREACHABLE(); } template std::array GetAsArray() { - const int ArrSize = sizeof(T) * N; - CHECK_LE(ArrSize, Size); - std::array Arr; - std::memcpy(Arr.data(), data(), ArrSize); - return Arr; + const int arr_size = sizeof(T) * N; + BM_CHECK_LE(arr_size, size); + std::array arr; + std::memcpy(arr.data(), data(), arr_size); + return arr; } }; -ValueUnion GetSysctlImp(std::string const& Name) { +ValueUnion GetSysctlImp(std::string const& name) { #if defined BENCHMARK_OS_OPENBSD int mib[2]; mib[0] = CTL_HW; - if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")){ + if ((name == "hw.ncpu") || (name == "hw.cpuspeed")) { ValueUnion buff(sizeof(int)); - if (Name == "hw.ncpu") { + if (name == "hw.ncpu") { mib[1] = HW_NCPU; } else { mib[1] = HW_CPUSPEED; @@ -161,41 +165,41 @@ ValueUnion GetSysctlImp(std::string const& Name) { } return ValueUnion(); #else - size_t CurBuffSize = 0; - if (sysctlbyname(Name.c_str(), nullptr, &CurBuffSize, nullptr, 0) == -1) + std::size_t cur_buff_size = 0; + if (sysctlbyname(name.c_str(), nullptr, &cur_buff_size, nullptr, 0) == -1) return ValueUnion(); - ValueUnion buff(CurBuffSize); - if (sysctlbyname(Name.c_str(), buff.data(), &buff.Size, nullptr, 0) == 0) + ValueUnion buff(cur_buff_size); + if (sysctlbyname(name.c_str(), buff.data(), &buff.size, nullptr, 0) == 0) return buff; return ValueUnion(); #endif } BENCHMARK_MAYBE_UNUSED -bool GetSysctl(std::string const& Name, std::string* Out) { - Out->clear(); - auto Buff = GetSysctlImp(Name); - if (!Buff) return false; - Out->assign(Buff.data()); +bool GetSysctl(std::string const& name, std::string* out) { + out->clear(); + auto buff = GetSysctlImp(name); + if (!buff) return false; + out->assign(buff.data()); return true; } template ::value>::type> -bool GetSysctl(std::string const& Name, Tp* Out) { - *Out = 0; - auto Buff 
= GetSysctlImp(Name); - if (!Buff) return false; - *Out = static_cast(Buff.GetAsUnsigned()); +bool GetSysctl(std::string const& name, Tp* out) { + *out = 0; + auto buff = GetSysctlImp(name); + if (!buff) return false; + *out = static_cast(buff.GetAsInteger()); return true; } template -bool GetSysctl(std::string const& Name, std::array* Out) { - auto Buff = GetSysctlImp(Name); - if (!Buff) return false; - *Out = Buff.GetAsArray(); +bool GetSysctl(std::string const& name, std::array* out) { + auto buff = GetSysctlImp(name); + if (!buff) return false; + *out = buff.GetAsArray(); return true; } #endif @@ -209,13 +213,12 @@ bool ReadFromFile(std::string const& fname, ArgT* arg) { return f.good(); } -bool CpuScalingEnabled(int num_cpus) { +CPUInfo::Scaling CpuScaling(int num_cpus) { // We don't have a valid CPU count, so don't even bother. - if (num_cpus <= 0) return false; -#ifdef BENCHMARK_OS_QNX - return false; -#endif -#ifndef BENCHMARK_OS_WINDOWS + if (num_cpus <= 0) return CPUInfo::Scaling::UNKNOWN; +#if defined(BENCHMARK_OS_QNX) + return CPUInfo::Scaling::UNKNOWN; +#elif !defined(BENCHMARK_OS_WINDOWS) // On Linux, the CPUfreq subsystem exposes CPU information as files on the // local file system. If reading the exported files fails, then we may not be // running on Linux, so we silently ignore all the read errors. 
@@ -223,27 +226,30 @@ bool CpuScalingEnabled(int num_cpus) { for (int cpu = 0; cpu < num_cpus; ++cpu) { std::string governor_file = StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor"); - if (ReadFromFile(governor_file, &res) && res != "performance") return true; + if (ReadFromFile(governor_file, &res) && res != "performance") + return CPUInfo::Scaling::ENABLED; } + return CPUInfo::Scaling::DISABLED; +#else + return CPUInfo::Scaling::UNKNOWN; #endif - return false; } -int CountSetBitsInCPUMap(std::string Val) { - auto CountBits = [](std::string Part) { +int CountSetBitsInCPUMap(std::string val) { + auto CountBits = [](std::string part) { using CPUMask = std::bitset; - Part = "0x" + Part; - CPUMask Mask(benchmark::stoul(Part, nullptr, 16)); - return static_cast(Mask.count()); + part = "0x" + part; + CPUMask mask(benchmark::stoul(part, nullptr, 16)); + return static_cast(mask.count()); }; - size_t Pos; + std::size_t pos; int total = 0; - while ((Pos = Val.find(',')) != std::string::npos) { - total += CountBits(Val.substr(0, Pos)); - Val = Val.substr(Pos + 1); + while ((pos = val.find(',')) != std::string::npos) { + total += CountBits(val.substr(0, pos)); + val = val.substr(pos + 1); } - if (!Val.empty()) { - total += CountBits(Val); + if (!val.empty()) { + total += CountBits(val); } return total; } @@ -252,16 +258,16 @@ BENCHMARK_MAYBE_UNUSED std::vector GetCacheSizesFromKVFS() { std::vector res; std::string dir = "/sys/devices/system/cpu/cpu0/cache/"; - int Idx = 0; + int idx = 0; while (true) { CPUInfo::CacheInfo info; - std::string FPath = StrCat(dir, "index", Idx++, "/"); - std::ifstream f(StrCat(FPath, "size").c_str()); + std::string fpath = StrCat(dir, "index", idx++, "/"); + std::ifstream f(StrCat(fpath, "size").c_str()); if (!f.is_open()) break; std::string suffix; f >> info.size; if (f.fail()) - PrintErrorAndDie("Failed while reading file '", FPath, "size'"); + PrintErrorAndDie("Failed while reading file '", fpath, "size'"); if (f.good()) 
{ f >> suffix; if (f.bad()) @@ -270,15 +276,15 @@ std::vector GetCacheSizesFromKVFS() { else if (f && suffix != "K") PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix); else if (suffix == "K") - info.size *= 1000; + info.size *= 1024; } - if (!ReadFromFile(StrCat(FPath, "type"), &info.type)) - PrintErrorAndDie("Failed to read from file ", FPath, "type"); - if (!ReadFromFile(StrCat(FPath, "level"), &info.level)) - PrintErrorAndDie("Failed to read from file ", FPath, "level"); + if (!ReadFromFile(StrCat(fpath, "type"), &info.type)) + PrintErrorAndDie("Failed to read from file ", fpath, "type"); + if (!ReadFromFile(StrCat(fpath, "level"), &info.level)) + PrintErrorAndDie("Failed to read from file ", fpath, "level"); std::string map_str; - if (!ReadFromFile(StrCat(FPath, "shared_cpu_map"), &map_str)) - PrintErrorAndDie("Failed to read from file ", FPath, "shared_cpu_map"); + if (!ReadFromFile(StrCat(fpath, "shared_cpu_map"), &map_str)) + PrintErrorAndDie("Failed to read from file ", fpath, "shared_cpu_map"); info.num_sharing = CountSetBitsInCPUMap(map_str); res.push_back(info); } @@ -289,26 +295,26 @@ std::vector GetCacheSizesFromKVFS() { #ifdef BENCHMARK_OS_MACOSX std::vector GetCacheSizesMacOSX() { std::vector res; - std::array CacheCounts{{0, 0, 0, 0}}; - GetSysctl("hw.cacheconfig", &CacheCounts); + std::array cache_counts{{0, 0, 0, 0}}; + GetSysctl("hw.cacheconfig", &cache_counts); struct { std::string name; std::string type; int level; - uint64_t num_sharing; - } Cases[] = {{"hw.l1dcachesize", "Data", 1, CacheCounts[1]}, - {"hw.l1icachesize", "Instruction", 1, CacheCounts[1]}, - {"hw.l2cachesize", "Unified", 2, CacheCounts[2]}, - {"hw.l3cachesize", "Unified", 3, CacheCounts[3]}}; - for (auto& C : Cases) { + int num_sharing; + } cases[] = {{"hw.l1dcachesize", "Data", 1, cache_counts[1]}, + {"hw.l1icachesize", "Instruction", 1, cache_counts[1]}, + {"hw.l2cachesize", "Unified", 2, cache_counts[2]}, + {"hw.l3cachesize", "Unified", 3, 
cache_counts[3]}}; + for (auto& c : cases) { int val; - if (!GetSysctl(C.name, &val)) continue; + if (!GetSysctl(c.name, &val)) continue; CPUInfo::CacheInfo info; - info.type = C.type; - info.level = C.level; + info.type = c.type; + info.level = c.level; info.size = val; - info.num_sharing = static_cast(C.num_sharing); + info.num_sharing = c.num_sharing; res.push_back(std::move(info)); } return res; @@ -322,7 +328,7 @@ std::vector GetCacheSizesWindows() { using UPtr = std::unique_ptr; GetLogicalProcessorInformation(nullptr, &buffer_size); - UPtr buff((PInfo*)malloc(buffer_size), &std::free); + UPtr buff(static_cast(std::malloc(buffer_size)), &std::free); if (!GetLogicalProcessorInformation(buff.get(), &buffer_size)) PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ", GetLastError()); @@ -333,15 +339,16 @@ std::vector GetCacheSizesWindows() { for (; it != end; ++it) { if (it->Relationship != RelationCache) continue; using BitSet = std::bitset; - BitSet B(it->ProcessorMask); + BitSet b(it->ProcessorMask); // To prevent duplicates, only consider caches where CPU 0 is specified - if (!B.test(0)) continue; - CInfo* Cache = &it->Cache; + if (!b.test(0)) continue; + const CInfo& cache = it->Cache; CPUInfo::CacheInfo C; - C.num_sharing = static_cast(B.count()); - C.level = Cache->Level; - C.size = Cache->Size; - switch (Cache->Type) { + C.num_sharing = static_cast(b.count()); + C.level = cache.Level; + C.size = cache.Size; + C.type = "Unknown"; + switch (cache.Type) { case CacheUnified: C.type = "Unified"; break; @@ -354,9 +361,6 @@ std::vector GetCacheSizesWindows() { case CacheTrace: C.type = "Trace"; break; - default: - C.type = "Unknown"; - break; } res.push_back(C); } @@ -365,27 +369,29 @@ std::vector GetCacheSizesWindows() { #elif BENCHMARK_OS_QNX std::vector GetCacheSizesQNX() { std::vector res; - struct cacheattr_entry *cache = SYSPAGE_ENTRY(cacheattr); + struct cacheattr_entry* cache = SYSPAGE_ENTRY(cacheattr); uint32_t const elsize = 
SYSPAGE_ELEMENT_SIZE(cacheattr); - int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize ; - for(int i = 0; i < num; ++i ) { + int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize; + for (int i = 0; i < num; ++i) { CPUInfo::CacheInfo info; - switch (cache->flags){ - case CACHE_FLAG_INSTR : + switch (cache->flags) { + case CACHE_FLAG_INSTR: info.type = "Instruction"; info.level = 1; break; - case CACHE_FLAG_DATA : + case CACHE_FLAG_DATA: info.type = "Data"; info.level = 1; break; - case CACHE_FLAG_UNIFIED : + case CACHE_FLAG_UNIFIED: info.type = "Unified"; info.level = 2; - case CACHE_FLAG_SHARED : + break; + case CACHE_FLAG_SHARED: info.type = "Shared"; info.level = 3; - default : + break; + default: continue; break; } @@ -405,6 +411,8 @@ std::vector GetCacheSizes() { return GetCacheSizesWindows(); #elif defined(BENCHMARK_OS_QNX) return GetCacheSizesQNX(); +#elif defined(BENCHMARK_OS_QURT) + return std::vector(); #else return GetCacheSizesFromKVFS(); #endif @@ -413,38 +421,57 @@ std::vector GetCacheSizes() { std::string GetSystemName() { #if defined(BENCHMARK_OS_WINDOWS) std::string str; - const unsigned COUNT = MAX_COMPUTERNAME_LENGTH+1; - TCHAR hostname[COUNT] = {'\0'}; + static constexpr int COUNT = MAX_COMPUTERNAME_LENGTH + 1; + TCHAR hostname[COUNT] = {'\0'}; DWORD DWCOUNT = COUNT; - if (!GetComputerName(hostname, &DWCOUNT)) - return std::string(""); + if (!GetComputerName(hostname, &DWCOUNT)) return std::string(""); #ifndef UNICODE str = std::string(hostname, DWCOUNT); #else - //Using wstring_convert, Is deprecated in C++17 - using convert_type = std::codecvt_utf8; - std::wstring_convert converter; - std::wstring wStr(hostname, DWCOUNT); - str = converter.to_bytes(wStr); + // `WideCharToMultiByte` returns `0` when conversion fails. 
+ int len = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, hostname, + DWCOUNT, NULL, 0, NULL, NULL); + str.resize(len); + WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, hostname, DWCOUNT, &str[0], + str.size(), NULL, NULL); #endif return str; -#else // defined(BENCHMARK_OS_WINDOWS) -#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined +#elif defined(BENCHMARK_OS_QURT) + std::string str = "Hexagon DSP"; + qurt_arch_version_t arch_version_struct; + if (qurt_sysenv_get_arch_version(&arch_version_struct) == QURT_EOK) { + str += " v"; + str += std::to_string(arch_version_struct.arch_version); + } + return str; +#else +#ifndef HOST_NAME_MAX +#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac doesn't have HOST_NAME_MAX defined +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_NACL) #define HOST_NAME_MAX 64 #elif defined(BENCHMARK_OS_QNX) #define HOST_NAME_MAX 154 +#elif defined(BENCHMARK_OS_RTEMS) +#define HOST_NAME_MAX 256 +#elif defined(BENCHMARK_OS_SOLARIS) +#define HOST_NAME_MAX MAXHOSTNAMELEN +#else +#pragma message("HOST_NAME_MAX not defined. using 64") +#define HOST_NAME_MAX 64 #endif +#endif // def HOST_NAME_MAX char hostname[HOST_NAME_MAX]; int retVal = gethostname(hostname, HOST_NAME_MAX); if (retVal != 0) return std::string(""); return std::string(hostname); -#endif // Catch-all POSIX block. +#endif // Catch-all POSIX block. } int GetNumCPUs() { #ifdef BENCHMARK_HAS_SYSCTL - int NumCPU = -1; - if (GetSysctl("hw.ncpu", &NumCPU)) return NumCPU; + int num_cpu = -1; + if (GetSysctl("hw.ncpu", &num_cpu)) return num_cpu; fprintf(stderr, "Err: %s\n", strerror(errno)); std::exit(EXIT_FAILURE); #elif defined(BENCHMARK_OS_WINDOWS) @@ -458,18 +485,23 @@ int GetNumCPUs() { // group #elif defined(BENCHMARK_OS_SOLARIS) // Returns -1 in case of a failure. 
- int NumCPU = sysconf(_SC_NPROCESSORS_ONLN); - if (NumCPU < 0) { - fprintf(stderr, - "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n", + long num_cpu = sysconf(_SC_NPROCESSORS_ONLN); + if (num_cpu < 0) { + fprintf(stderr, "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n", strerror(errno)); } - return NumCPU; + return (int)num_cpu; #elif defined(BENCHMARK_OS_QNX) return static_cast(_syspage_ptr->num_cpu); +#elif defined(BENCHMARK_OS_QURT) + qurt_sysenv_max_hthreads_t hardware_threads; + if (qurt_sysenv_get_max_hw_threads(&hardware_threads) != QURT_EOK) { + hardware_threads.max_hthreads = 1; + } + return hardware_threads.max_hthreads; #else - int NumCPUs = 0; - int MaxID = -1; + int num_cpus = 0; + int max_id = -1; std::ifstream f("/proc/cpuinfo"); if (!f.is_open()) { std::cerr << "failed to open /proc/cpuinfo\n"; @@ -479,20 +511,21 @@ int GetNumCPUs() { std::string ln; while (std::getline(f, ln)) { if (ln.empty()) continue; - size_t SplitIdx = ln.find(':'); + std::size_t split_idx = ln.find(':'); std::string value; #if defined(__s390__) // s390 has another format in /proc/cpuinfo // it needs to be parsed differently - if (SplitIdx != std::string::npos) value = ln.substr(Key.size()+1,SplitIdx-Key.size()-1); + if (split_idx != std::string::npos) + value = ln.substr(Key.size() + 1, split_idx - Key.size() - 1); #else - if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1); + if (split_idx != std::string::npos) value = ln.substr(split_idx + 1); #endif if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) { - NumCPUs++; + num_cpus++; if (!value.empty()) { - int CurID = benchmark::stoi(value); - MaxID = std::max(CurID, MaxID); + const int cur_id = benchmark::stoi(value); + max_id = std::max(cur_id, max_id); } } } @@ -506,17 +539,95 @@ int GetNumCPUs() { } f.close(); - if ((MaxID + 1) != NumCPUs) { + if ((max_id + 1) != num_cpus) { fprintf(stderr, "CPU ID assignments in /proc/cpuinfo seem messed up." 
" This is usually caused by a bad BIOS.\n"); } - return NumCPUs; + return num_cpus; #endif BENCHMARK_UNREACHABLE(); } -double GetCPUCyclesPerSecond() { +class ThreadAffinityGuard final { + public: + ThreadAffinityGuard() : reset_affinity(SetAffinity()) { + if (!reset_affinity) + std::cerr << "***WARNING*** Failed to set thread affinity. Estimated CPU " + "frequency may be incorrect." + << std::endl; + } + + ~ThreadAffinityGuard() { + if (!reset_affinity) return; + +#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY) + int ret = pthread_setaffinity_np(self, sizeof(previous_affinity), + &previous_affinity); + if (ret == 0) return; +#elif defined(BENCHMARK_OS_WINDOWS_WIN32) + DWORD_PTR ret = SetThreadAffinityMask(self, previous_affinity); + if (ret != 0) return; +#endif // def BENCHMARK_HAS_PTHREAD_AFFINITY + PrintErrorAndDie("Failed to reset thread affinity"); + } + + ThreadAffinityGuard(ThreadAffinityGuard&&) = delete; + ThreadAffinityGuard(const ThreadAffinityGuard&) = delete; + ThreadAffinityGuard& operator=(ThreadAffinityGuard&&) = delete; + ThreadAffinityGuard& operator=(const ThreadAffinityGuard&) = delete; + + private: + bool SetAffinity() { +#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY) + int ret; + self = pthread_self(); + ret = pthread_getaffinity_np(self, sizeof(previous_affinity), + &previous_affinity); + if (ret != 0) return false; + + cpu_set_t affinity; + memcpy(&affinity, &previous_affinity, sizeof(affinity)); + + bool is_first_cpu = true; + + for (int i = 0; i < CPU_SETSIZE; ++i) + if (CPU_ISSET(i, &affinity)) { + if (is_first_cpu) + is_first_cpu = false; + else + CPU_CLR(i, &affinity); + } + + if (is_first_cpu) return false; + + ret = pthread_setaffinity_np(self, sizeof(affinity), &affinity); + return ret == 0; +#elif defined(BENCHMARK_OS_WINDOWS_WIN32) + self = GetCurrentThread(); + DWORD_PTR mask = static_cast(1) << GetCurrentProcessorNumber(); + previous_affinity = SetThreadAffinityMask(self, mask); + return previous_affinity != 0; +#else + return false; 
+#endif // def BENCHMARK_HAS_PTHREAD_AFFINITY + } + +#if defined(BENCHMARK_HAS_PTHREAD_AFFINITY) + pthread_t self; + cpu_set_t previous_affinity; +#elif defined(BENCHMARK_OS_WINDOWS_WIN32) + HANDLE self; + DWORD_PTR previous_affinity; +#endif // def BENCHMARK_HAS_PTHREAD_AFFINITY + bool reset_affinity; +}; + +double GetCPUCyclesPerSecond(CPUInfo::Scaling scaling) { + // Currently, scaling is only used on linux path here, + // suppress diagnostics about it being unused on other paths. + (void)scaling; + #if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN long freq; @@ -527,8 +638,15 @@ double GetCPUCyclesPerSecond() { // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as // well. if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq) - // If CPU scaling is in effect, we want to use the *maximum* frequency, - // not whatever CPU speed some random processor happens to be using now. + // If CPU scaling is disabled, use the *current* frequency. + // Note that we specifically don't want to read cpuinfo_cur_freq, + // because it is only readable by root. + || (scaling == CPUInfo::Scaling::DISABLED && + ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", + &freq)) + // Otherwise, if CPU scaling may be in effect, we want to use + // the *maximum* frequency, not whatever CPU speed some random processor + // happens to be using now. || ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", &freq)) { // The value is in kHz (as the file name suggests). 
For example, on a @@ -545,7 +663,7 @@ double GetCPUCyclesPerSecond() { return error_value; } - auto startsWithKey = [](std::string const& Value, std::string const& Key) { + auto StartsWithKey = [](std::string const& Value, std::string const& Key) { if (Key.size() > Value.size()) return false; auto Cmp = [&](char X, char Y) { return std::tolower(X) == std::tolower(Y); @@ -556,18 +674,18 @@ double GetCPUCyclesPerSecond() { std::string ln; while (std::getline(f, ln)) { if (ln.empty()) continue; - size_t SplitIdx = ln.find(':'); + std::size_t split_idx = ln.find(':'); std::string value; - if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1); + if (split_idx != std::string::npos) value = ln.substr(split_idx + 1); // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only // accept positive values. Some environments (virtual machines) report zero, // which would cause infinite looping in WallTime_Init. - if (startsWithKey(ln, "cpu MHz")) { + if (StartsWithKey(ln, "cpu MHz")) { if (!value.empty()) { double cycles_per_second = benchmark::stod(value) * 1000000.0; if (cycles_per_second > 0) return cycles_per_second; } - } else if (startsWithKey(ln, "bogomips")) { + } else if (StartsWithKey(ln, "bogomips")) { if (!value.empty()) { bogo_clock = benchmark::stod(value) * 1000000.0; if (bogo_clock < 0.0) bogo_clock = error_value; @@ -589,24 +707,29 @@ double GetCPUCyclesPerSecond() { if (bogo_clock >= 0.0) return bogo_clock; #elif defined BENCHMARK_HAS_SYSCTL - constexpr auto* FreqStr = + constexpr auto* freqStr = #if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD) "machdep.tsc_freq"; #elif defined BENCHMARK_OS_OPENBSD "hw.cpuspeed"; +#elif defined BENCHMARK_OS_DRAGONFLY + "hw.tsc_frequency"; #else "hw.cpufrequency"; #endif unsigned long long hz = 0; #if defined BENCHMARK_OS_OPENBSD - if (GetSysctl(FreqStr, &hz)) return hz * 1000000; + if (GetSysctl(freqStr, &hz)) return hz * 1000000; #else - if (GetSysctl(FreqStr, &hz)) return hz; + if 
(GetSysctl(freqStr, &hz)) return hz; #endif fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", - FreqStr, strerror(errno)); + freqStr, strerror(errno)); + fprintf(stderr, + "This does not affect benchmark measurements, only the " + "metadata output.\n"); -#elif defined BENCHMARK_OS_WINDOWS +#elif defined BENCHMARK_OS_WINDOWS_WIN32 // In NT, read MHz from the registry. If we fail to do so or we're in win9x // then make a crude estimate. DWORD data, data_size = sizeof(data); @@ -615,15 +738,16 @@ double GetCPUCyclesPerSecond() { SHGetValueA(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", "~MHz", nullptr, &data, &data_size))) - return static_cast((int64_t)data * - (int64_t)(1000 * 1000)); // was mhz -#elif defined (BENCHMARK_OS_SOLARIS) - kstat_ctl_t *kc = kstat_open(); + return static_cast(static_cast(data) * + static_cast(1000 * 1000)); // was mhz +#elif defined(BENCHMARK_OS_SOLARIS) + kstat_ctl_t* kc = kstat_open(); if (!kc) { std::cerr << "failed to open /dev/kstat\n"; return -1; } - kstat_t *ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0"); + kstat_t* ksp = kstat_lookup(kc, const_cast("cpu_info"), -1, + const_cast("cpu_info0")); if (!ksp) { std::cerr << "failed to lookup in /dev/kstat\n"; return -1; @@ -632,8 +756,8 @@ double GetCPUCyclesPerSecond() { std::cerr << "failed to read from /dev/kstat\n"; return -1; } - kstat_named_t *knp = - (kstat_named_t*)kstat_data_lookup(ksp, (char*)"current_clock_Hz"); + kstat_named_t* knp = (kstat_named_t*)kstat_data_lookup( + ksp, const_cast("current_clock_Hz")); if (!knp) { std::cerr << "failed to lookup data in /dev/kstat\n"; return -1; @@ -646,22 +770,55 @@ double GetCPUCyclesPerSecond() { double clock_hz = knp->value.ui64; kstat_close(kc); return clock_hz; -#elif defined (BENCHMARK_OS_QNX) +#elif defined(BENCHMARK_OS_QNX) return static_cast((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) * (int64_t)(1000 * 1000)); +#elif defined(BENCHMARK_OS_QURT) + // QuRT doesn't 
provide any API to query Hexagon frequency. + return 1000000000; #endif // If we've fallen through, attempt to roughly estimate the CPU clock rate. - const int estimate_time_ms = 1000; + + // Make sure to use the same cycle counter when starting and stopping the + // cycle timer. We just pin the current thread to a cpu in the previous + // affinity set. + ThreadAffinityGuard affinity_guard; + + static constexpr double estimate_time_s = 1.0; + const double start_time = ChronoClockNow(); const auto start_ticks = cycleclock::Now(); - SleepForMilliseconds(estimate_time_ms); - return static_cast(cycleclock::Now() - start_ticks); + + // Impose load instead of calling sleep() to make sure the cycle counter + // works. + using PRNG = std::minstd_rand; + using Result = PRNG::result_type; + PRNG rng(static_cast(start_ticks)); + + Result state = 0; + + do { + static constexpr size_t batch_size = 10000; + rng.discard(batch_size); + state += rng(); + + } while (ChronoClockNow() - start_time < estimate_time_s); + + DoNotOptimize(state); + + const auto end_ticks = cycleclock::Now(); + const double end_time = ChronoClockNow(); + + return static_cast(end_ticks - start_ticks) / (end_time - start_time); + // Reset the affinity of current thread when the lifetime of affinity_guard + // ends. 
} std::vector GetLoadAvg() { -#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || \ - defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD || \ - defined BENCHMARK_OS_OPENBSD) && !defined(__ANDROID__) - constexpr int kMaxSamples = 3; +#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || \ + defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD || \ + defined BENCHMARK_OS_OPENBSD || defined BENCHMARK_OS_DRAGONFLY) && \ + !(defined(__ANDROID__) && __ANDROID_API__ < 29) + static constexpr int kMaxSamples = 3; std::vector res(kMaxSamples, 0.0); const int nelem = getloadavg(res.data(), kMaxSamples); if (nelem < 1) { @@ -684,12 +841,11 @@ const CPUInfo& CPUInfo::Get() { CPUInfo::CPUInfo() : num_cpus(GetNumCPUs()), - cycles_per_second(GetCPUCyclesPerSecond()), + scaling(CpuScaling(num_cpus)), + cycles_per_second(GetCPUCyclesPerSecond(scaling)), caches(GetCacheSizes()), - scaling_enabled(CpuScalingEnabled(num_cpus)), load_avg(GetLoadAvg()) {} - const SystemInfo& SystemInfo::Get() { static const SystemInfo* info = new SystemInfo(); return *info; diff --git a/ThirdParty/googlebenchmark/src/thread_manager.h b/ThirdParty/googlebenchmark/src/thread_manager.h index 1720281f0a..819b3c44db 100644 --- a/ThirdParty/googlebenchmark/src/thread_manager.h +++ b/ThirdParty/googlebenchmark/src/thread_manager.h @@ -11,7 +11,7 @@ namespace internal { class ThreadManager { public: - ThreadManager(int num_threads) + explicit ThreadManager(int num_threads) : alive_threads_(num_threads), start_stop_barrier_(num_threads) {} Mutex& GetBenchmarkMutex() const RETURN_CAPABILITY(benchmark_mutex_) { @@ -36,7 +36,6 @@ class ThreadManager { [this]() { return alive_threads_ == 0; }); } - public: struct Result { IterationCount iterations = 0; double real_time_used = 0; @@ -44,8 +43,8 @@ class ThreadManager { double manual_time_used = 0; int64_t complexity_n = 0; std::string report_label_; - std::string error_message_; - bool has_error_ = false; + std::string 
skip_message_; + internal::Skipped skipped_ = internal::NotSkipped; UserCounters counters; }; GUARDED_BY(GetBenchmarkMutex()) Result results; diff --git a/ThirdParty/googlebenchmark/src/thread_timer.h b/ThirdParty/googlebenchmark/src/thread_timer.h index fbd298d3bd..eb23f59561 100644 --- a/ThirdParty/googlebenchmark/src/thread_timer.h +++ b/ThirdParty/googlebenchmark/src/thread_timer.h @@ -28,7 +28,7 @@ class ThreadTimer { // Called by each thread void StopTimer() { - CHECK(running_); + BM_CHECK(running_); running_ = false; real_time_used_ += ChronoClockNow() - start_real_time_; // Floating point error can result in the subtraction producing a negative @@ -43,20 +43,20 @@ class ThreadTimer { bool running() const { return running_; } // REQUIRES: timer is not running - double real_time_used() { - CHECK(!running_); + double real_time_used() const { + BM_CHECK(!running_); return real_time_used_; } // REQUIRES: timer is not running - double cpu_time_used() { - CHECK(!running_); + double cpu_time_used() const { + BM_CHECK(!running_); return cpu_time_used_; } // REQUIRES: timer is not running - double manual_time_used() { - CHECK(!running_); + double manual_time_used() const { + BM_CHECK(!running_); return manual_time_used_; } diff --git a/ThirdParty/googlebenchmark/src/timers.cc b/ThirdParty/googlebenchmark/src/timers.cc index 7613ff92c6..b23feea8ba 100644 --- a/ThirdParty/googlebenchmark/src/timers.cc +++ b/ThirdParty/googlebenchmark/src/timers.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "timers.h" + #include "internal_macros.h" #ifdef BENCHMARK_OS_WINDOWS @@ -22,13 +23,14 @@ #include #else #include -#ifndef BENCHMARK_OS_FUCHSIA +#if !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT) #include #endif #include #include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD #include -#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX +#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_DRAGONFLY || \ + defined BENCHMARK_OS_MACOSX #include #endif #if defined(BENCHMARK_OS_MACOSX) @@ -36,6 +38,9 @@ #include #include #endif +#if defined(BENCHMARK_OS_QURT) +#include +#endif #endif #ifdef BENCHMARK_OS_EMSCRIPTEN @@ -54,7 +59,6 @@ #include "check.h" #include "log.h" -#include "sleep.h" #include "string_util.h" namespace benchmark { @@ -63,6 +67,9 @@ namespace benchmark { #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wunused-function" #endif +#if defined(__NVCOMPILER) +#pragma diag_suppress declared_but_not_referenced +#endif namespace { #if defined(BENCHMARK_OS_WINDOWS) @@ -77,7 +84,7 @@ double MakeTime(FILETIME const& kernel_time, FILETIME const& user_time) { static_cast(user.QuadPart)) * 1e-7; } -#elif !defined(BENCHMARK_OS_FUCHSIA) +#elif !defined(BENCHMARK_OS_FUCHSIA) && !defined(BENCHMARK_OS_QURT) double MakeTime(struct rusage const& ru) { return (static_cast(ru.ru_utime.tv_sec) + static_cast(ru.ru_utime.tv_usec) * 1e-6 + @@ -117,15 +124,19 @@ double ProcessCPUUsage() { &user_time)) return MakeTime(kernel_time, user_time); DiagnoseAndExit("GetProccessTimes() failed"); +#elif defined(BENCHMARK_OS_QURT) + return static_cast( + qurt_timer_timetick_to_us(qurt_timer_get_ticks())) * + 1.0e-6; #elif defined(BENCHMARK_OS_EMSCRIPTEN) // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten. // Use Emscripten-specific API. 
Reported CPU time would be exactly the // same as total time, but this is ok because there aren't long-latency - // syncronous system calls in Emscripten. + // synchronous system calls in Emscripten. return emscripten_get_now() * 1e-3; #elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX) - // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See - // https://github.com/google/benchmark/pull/292 + // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. + // See https://github.com/google/benchmark/pull/292 struct timespec spec; if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0) return MakeTime(spec); @@ -147,14 +158,19 @@ double ThreadCPUUsage() { GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time, &user_time); return MakeTime(kernel_time, user_time); +#elif defined(BENCHMARK_OS_QURT) + return static_cast( + qurt_timer_timetick_to_us(qurt_timer_get_ticks())) * + 1.0e-6; #elif defined(BENCHMARK_OS_MACOSX) - // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. See - // https://github.com/google/benchmark/pull/292 + // FIXME We want to use clock_gettime, but its not available in MacOS 10.11. + // See https://github.com/google/benchmark/pull/292 mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT; thread_basic_info_data_t info; mach_port_t thread = pthread_mach_thread_np(pthread_self()); - if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)&info, &count) == - KERN_SUCCESS) { + if (thread_info(thread, THREAD_BASIC_INFO, + reinterpret_cast(&info), + &count) == KERN_SUCCESS) { return MakeTime(info); } DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info"); @@ -178,40 +194,79 @@ double ThreadCPUUsage() { #endif } -namespace { - -std::string DateTimeString(bool local) { +std::string LocalDateTimeString() { + // Write the local time in RFC3339 format yyyy-mm-ddTHH:MM:SS+/-HH:MM. 
typedef std::chrono::system_clock Clock; std::time_t now = Clock::to_time_t(Clock::now()); - const std::size_t kStorageSize = 128; - char storage[kStorageSize]; - std::size_t written; + const std::size_t kTzOffsetLen = 6; + const std::size_t kTimestampLen = 19; + + std::size_t tz_len; + std::size_t timestamp_len; + long int offset_minutes; + char tz_offset_sign = '+'; + // tz_offset is set in one of three ways: + // * strftime with %z - This either returns empty or the ISO 8601 time. The + // maximum length an + // ISO 8601 string can be is 7 (e.g. -03:30, plus trailing zero). + // * snprintf with %c%02li:%02li - The maximum length is 41 (one for %c, up to + // 19 for %02li, + // one for :, up to 19 %02li, plus trailing zero). + // * A fixed string of "-00:00". The maximum length is 7 (-00:00, plus + // trailing zero). + // + // Thus, the maximum size this needs to be is 41. + char tz_offset[41]; + // Long enough buffer to avoid format-overflow warnings + char storage[128]; - if (local) { #if defined(BENCHMARK_OS_WINDOWS) - written = - std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now)); + std::tm* timeinfo_p = ::localtime(&now); #else - std::tm timeinfo; - ::localtime_r(&now, &timeinfo); - written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo); + std::tm timeinfo; + std::tm* timeinfo_p = &timeinfo; + ::localtime_r(&now, &timeinfo); #endif + + tz_len = std::strftime(tz_offset, sizeof(tz_offset), "%z", timeinfo_p); + + if (tz_len < kTzOffsetLen && tz_len > 1) { + // Timezone offset was written. strftime writes offset as +HHMM or -HHMM, + // RFC3339 specifies an offset as +HH:MM or -HH:MM. To convert, we parse + // the offset as an integer, then reprint it to a string. 
+ + offset_minutes = ::strtol(tz_offset, NULL, 10); + if (offset_minutes < 0) { + offset_minutes *= -1; + tz_offset_sign = '-'; + } + + tz_len = + ::snprintf(tz_offset, sizeof(tz_offset), "%c%02li:%02li", + tz_offset_sign, offset_minutes / 100, offset_minutes % 100); + BM_CHECK(tz_len == kTzOffsetLen); + ((void)tz_len); // Prevent unused variable warning in optimized build. } else { + // Unknown offset. RFC3339 specifies that unknown local offsets should be + // written as UTC time with -00:00 timezone. #if defined(BENCHMARK_OS_WINDOWS) - written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now)); + // Potential race condition if another thread calls localtime or gmtime. + timeinfo_p = ::gmtime(&now); #else - std::tm timeinfo; ::gmtime_r(&now, &timeinfo); - written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo); #endif + + strncpy(tz_offset, "-00:00", kTzOffsetLen + 1); } - CHECK(written < kStorageSize); - ((void)written); // prevent unused variable in optimized mode. - return std::string(storage); -} -} // end namespace + timestamp_len = + std::strftime(storage, sizeof(storage), "%Y-%m-%dT%H:%M:%S", timeinfo_p); + BM_CHECK(timestamp_len == kTimestampLen); + // Prevent unused variable warning in optimized build. 
+ ((void)kTimestampLen); -std::string LocalDateTimeString() { return DateTimeString(true); } + std::strncat(storage, tz_offset, sizeof(storage) - timestamp_len - 1); + return std::string(storage); +} } // end namespace benchmark diff --git a/ThirdParty/googlebenchmark/tools/compare.py b/ThirdParty/googlebenchmark/tools/compare.py index 539ace6fb1..e5eeb247e6 100755 --- a/ThirdParty/googlebenchmark/tools/compare.py +++ b/ThirdParty/googlebenchmark/tools/compare.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import unittest """ @@ -7,26 +7,30 @@ import argparse from argparse import ArgumentParser +import json import sys +import os import gbench from gbench import util, report -from gbench.util import * def check_inputs(in1, in2, flags): """ Perform checking on the user provided inputs and diagnose any abnormalities """ - in1_kind, in1_err = classify_input_file(in1) - in2_kind, in2_err = classify_input_file(in2) - output_file = find_benchmark_flag('--benchmark_out=', flags) - output_type = find_benchmark_flag('--benchmark_out_format=', flags) - if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file: + in1_kind, in1_err = util.classify_input_file(in1) + in2_kind, in2_err = util.classify_input_file(in2) + output_file = util.find_benchmark_flag('--benchmark_out=', flags) + output_type = util.find_benchmark_flag('--benchmark_out_format=', flags) + if in1_kind == util.IT_Executable and in2_kind == util.IT_Executable and output_file: print(("WARNING: '--benchmark_out=%s' will be passed to both " "benchmarks causing it to be overwritten") % output_file) - if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0: - print("WARNING: passing optional flags has no effect since both " - "inputs are JSON") + if in1_kind == util.IT_JSON and in2_kind == util.IT_JSON: + # When both sides are JSON the only supported flag is + # --benchmark_filter= + for flag in util.remove_benchmark_flags('--benchmark_filter=', flags): + print("WARNING: 
passing %s has no effect since both " + "inputs are JSON" % flag) if output_type is not None and output_type != 'json': print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py`" " is not supported.") % output_type) @@ -48,6 +52,20 @@ def create_parser(): "of repetitions. Do note that only the display is affected. " "Internally, all the actual runs are still used, e.g. for U test.") + parser.add_argument( + '--no-color', + dest='color', + default=True, + action="store_false", + help="Do not use colors in the terminal output" + ) + + parser.add_argument( + '-d', + '--dump_to_json', + dest='dump_to_json', + help="Additionally, dump benchmark comparison output to this file in JSON format.") + utest = parser.add_argument_group() utest.add_argument( '--no-utest', @@ -223,10 +241,10 @@ def main(): options_contender = ['--benchmark_filter=%s' % filter_contender] # Run the benchmarks and report the results - json1 = json1_orig = gbench.util.run_or_load_benchmark( - test_baseline, benchmark_options + options_baseline) - json2 = json2_orig = gbench.util.run_or_load_benchmark( - test_contender, benchmark_options + options_contender) + json1 = json1_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark( + test_baseline, benchmark_options + options_baseline)) + json2 = json2_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark( + test_contender, benchmark_options + options_contender)) # Now, filter the benchmarks so that the difference report can work if filter_baseline and filter_contender: @@ -236,14 +254,20 @@ def main(): json2 = gbench.report.filter_benchmark( json2_orig, filter_contender, replacement) - # Diff and output - output_lines = gbench.report.generate_difference_report( - json1, json2, args.display_aggregates_only, - args.utest, args.utest_alpha) + diff_report = gbench.report.get_difference_report( + json1, json2, args.utest) + output_lines = gbench.report.print_difference_report( + diff_report, + 
args.display_aggregates_only, + args.utest, args.utest_alpha, args.color) print(description) for ln in output_lines: print(ln) + # Optionally, diff and output to JSON + if args.dump_to_json is not None: + with open(args.dump_to_json, 'w') as f_json: + json.dump(diff_report, f_json) class TestParser(unittest.TestCase): def setUp(self): diff --git a/ThirdParty/googlebenchmark/tools/gbench/Inputs/test1_run1.json b/ThirdParty/googlebenchmark/tools/gbench/Inputs/test1_run1.json index 601e327aef..9daed0bcc6 100644 --- a/ThirdParty/googlebenchmark/tools/gbench/Inputs/test1_run1.json +++ b/ThirdParty/googlebenchmark/tools/gbench/Inputs/test1_run1.json @@ -114,6 +114,14 @@ "real_time": 1, "cpu_time": 1, "time_unit": "s" + }, + { + "name": "BM_hasLabel", + "label": "a label", + "iterations": 1, + "real_time": 1, + "cpu_time": 1, + "time_unit": "s" } ] } diff --git a/ThirdParty/googlebenchmark/tools/gbench/Inputs/test1_run2.json b/ThirdParty/googlebenchmark/tools/gbench/Inputs/test1_run2.json index 3cbcf39b0c..dc52970abf 100644 --- a/ThirdParty/googlebenchmark/tools/gbench/Inputs/test1_run2.json +++ b/ThirdParty/googlebenchmark/tools/gbench/Inputs/test1_run2.json @@ -114,6 +114,14 @@ "real_time": 1, "cpu_time": 1, "time_unit": "ns" + }, + { + "name": "BM_hasLabel", + "label": "a label", + "iterations": 1, + "real_time": 1, + "cpu_time": 1, + "time_unit": "s" } ] } diff --git a/ThirdParty/googlebenchmark/tools/gbench/Inputs/test4_run.json b/ThirdParty/googlebenchmark/tools/gbench/Inputs/test4_run.json new file mode 100644 index 0000000000..eaa005f3a9 --- /dev/null +++ b/ThirdParty/googlebenchmark/tools/gbench/Inputs/test4_run.json @@ -0,0 +1,96 @@ +{ + "benchmarks": [ + { + "name": "99 family 0 instance 0 repetition 0", + "run_type": "iteration", + "family_index": 0, + "per_family_instance_index": 0, + "repetition_index": 0 + }, + { + "name": "98 family 0 instance 0 repetition 1", + "run_type": "iteration", + "family_index": 0, + "per_family_instance_index": 0, + 
"repetition_index": 1 + }, + { + "name": "97 family 0 instance 0 aggregate", + "run_type": "aggregate", + "family_index": 0, + "per_family_instance_index": 0, + "aggregate_name": "9 aggregate" + }, + + + { + "name": "96 family 0 instance 1 repetition 0", + "run_type": "iteration", + "family_index": 0, + "per_family_instance_index": 1, + "repetition_index": 0 + }, + { + "name": "95 family 0 instance 1 repetition 1", + "run_type": "iteration", + "family_index": 0, + "per_family_instance_index": 1, + "repetition_index": 1 + }, + { + "name": "94 family 0 instance 1 aggregate", + "run_type": "aggregate", + "family_index": 0, + "per_family_instance_index": 1, + "aggregate_name": "9 aggregate" + }, + + + + + { + "name": "93 family 1 instance 0 repetition 0", + "run_type": "iteration", + "family_index": 1, + "per_family_instance_index": 0, + "repetition_index": 0 + }, + { + "name": "92 family 1 instance 0 repetition 1", + "run_type": "iteration", + "family_index": 1, + "per_family_instance_index": 0, + "repetition_index": 1 + }, + { + "name": "91 family 1 instance 0 aggregate", + "run_type": "aggregate", + "family_index": 1, + "per_family_instance_index": 0, + "aggregate_name": "9 aggregate" + }, + + + { + "name": "90 family 1 instance 1 repetition 0", + "run_type": "iteration", + "family_index": 1, + "per_family_instance_index": 1, + "repetition_index": 0 + }, + { + "name": "89 family 1 instance 1 repetition 1", + "run_type": "iteration", + "family_index": 1, + "per_family_instance_index": 1, + "repetition_index": 1 + }, + { + "name": "88 family 1 instance 1 aggregate", + "run_type": "aggregate", + "family_index": 1, + "per_family_instance_index": 1, + "aggregate_name": "9 aggregate" + } + ] +} diff --git a/ThirdParty/googlebenchmark/tools/gbench/Inputs/test4_run0.json b/ThirdParty/googlebenchmark/tools/gbench/Inputs/test4_run0.json new file mode 100644 index 0000000000..54cf127585 --- /dev/null +++ b/ThirdParty/googlebenchmark/tools/gbench/Inputs/test4_run0.json @@ -0,0 
+1,21 @@ +{ + "context": { + "date": "2016-08-02 17:44:46", + "num_cpus": 4, + "mhz_per_cpu": 4228, + "cpu_scaling_enabled": false, + "library_build_type": "release" + }, + "benchmarks": [ + { + "name": "whocares", + "run_type": "aggregate", + "aggregate_name": "zz", + "aggregate_unit": "percentage", + "iterations": 1000, + "real_time": 0.01, + "cpu_time": 0.10, + "time_unit": "ns" + } + ] +} diff --git a/ThirdParty/googlebenchmark/tools/gbench/Inputs/test4_run1.json b/ThirdParty/googlebenchmark/tools/gbench/Inputs/test4_run1.json new file mode 100644 index 0000000000..25d56050c9 --- /dev/null +++ b/ThirdParty/googlebenchmark/tools/gbench/Inputs/test4_run1.json @@ -0,0 +1,21 @@ +{ + "context": { + "date": "2016-08-02 17:44:46", + "num_cpus": 4, + "mhz_per_cpu": 4228, + "cpu_scaling_enabled": false, + "library_build_type": "release" + }, + "benchmarks": [ + { + "name": "whocares", + "run_type": "aggregate", + "aggregate_name": "zz", + "aggregate_unit": "percentage", + "iterations": 1000, + "real_time": 0.005, + "cpu_time": 0.15, + "time_unit": "ns" + } + ] +} diff --git a/ThirdParty/googlebenchmark/tools/gbench/report.py b/ThirdParty/googlebenchmark/tools/gbench/report.py index 5bd3a8d85d..b2bbfb9f62 100644 --- a/ThirdParty/googlebenchmark/tools/gbench/report.py +++ b/ThirdParty/googlebenchmark/tools/gbench/report.py @@ -1,11 +1,14 @@ -import unittest """report.py - Utilities for reporting statistics about benchmark results """ + +import unittest import os import re import copy +import random -from scipy.stats import mannwhitneyu +from scipy.stats import mannwhitneyu, gmean +from numpy import array class BenchmarkColor(object): @@ -39,6 +42,13 @@ def __format__(self, format): UTEST_OPTIMAL_REPETITIONS = 9 # Lowest reasonable number, More is better. 
UTEST_COL_NAME = "_pvalue" +_TIME_UNIT_TO_SECONDS_MULTIPLIER = { + "s": 1.0, + "ms": 1e-3, + "us": 1e-6, + "ns": 1e-9, +} + def color_format(use_color, fmt_str, *args, **kwargs): """ @@ -148,12 +158,37 @@ def partition_benchmarks(json1, json2): return partitions +def get_timedelta_field_as_seconds(benchmark, field_name): + """ + Get value of field_name field of benchmark, which is time with time unit + time_unit, as time in seconds. + """ + timedelta = benchmark[field_name] + time_unit = benchmark.get('time_unit', 's') + return timedelta * _TIME_UNIT_TO_SECONDS_MULTIPLIER.get(time_unit) + + +def calculate_geomean(json): + """ + Extract all real/cpu times from all the benchmarks as seconds, + and calculate their geomean. + """ + times = [] + for benchmark in json['benchmarks']: + if 'run_type' in benchmark and benchmark['run_type'] == 'aggregate': + continue + times.append([get_timedelta_field_as_seconds(benchmark, 'real_time'), + get_timedelta_field_as_seconds(benchmark, 'cpu_time')]) + return gmean(times) if times else array([]) + + def extract_field(partition, field_name): # The count of elements may be different. We want *all* of them. 
lhs = [x[field_name] for x in partition[0]] rhs = [x[field_name] for x in partition[1]] return [lhs, rhs] + def calc_utest(timings_cpu, timings_time): min_rep_cnt = min(len(timings_time[0]), len(timings_time[1]), @@ -171,46 +206,130 @@ def calc_utest(timings_cpu, timings_time): return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue -def print_utest(partition, utest_alpha, first_col_width, use_color=True): + +def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True): def get_utest_color(pval): return BC_FAIL if pval >= utest_alpha else BC_OKGREEN - timings_time = extract_field(partition, 'real_time') - timings_cpu = extract_field(partition, 'cpu_time') - have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time) - # Check if we failed miserably with minimum required repetitions for utest - if not have_optimal_repetitions and cpu_pvalue is None and time_pvalue is None: + if not utest['have_optimal_repetitions'] and utest['cpu_pvalue'] is None and utest['time_pvalue'] is None: return [] dsc = "U Test, Repetitions: {} vs {}".format( - len(timings_cpu[0]), len(timings_cpu[1])) + utest['nr_of_repetitions'], utest['nr_of_repetitions_other']) dsc_color = BC_OKGREEN # We still got some results to show but issue a warning about it. - if not have_optimal_repetitions: + if not utest['have_optimal_repetitions']: dsc_color = BC_WARNING dsc += ". WARNING: Results unreliable! 
{}+ repetitions recommended.".format( UTEST_OPTIMAL_REPETITIONS) special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{} {}" - last_name = partition[0][0]['name'] return [color_format(use_color, special_str, BC_HEADER, - "{}{}".format(last_name, UTEST_COL_NAME), + "{}{}".format(bc_name, UTEST_COL_NAME), first_col_width, - get_utest_color(time_pvalue), time_pvalue, - get_utest_color(cpu_pvalue), cpu_pvalue, + get_utest_color( + utest['time_pvalue']), utest['time_pvalue'], + get_utest_color( + utest['cpu_pvalue']), utest['cpu_pvalue'], dsc_color, dsc, endc=BC_ENDC)] -def generate_difference_report( +def get_difference_report( json1, json2, - display_aggregates_only=False, + utest=False): + """ + Calculate and report the difference between each test of two benchmarks + runs specified as 'json1' and 'json2'. Output is another json containing + relevant details for each test run. + """ + assert utest is True or utest is False + + diff_report = [] + partitions = partition_benchmarks(json1, json2) + for partition in partitions: + benchmark_name = partition[0][0]['name'] + label = partition[0][0]['label'] if 'label' in partition[0][0] else '' + time_unit = partition[0][0]['time_unit'] + measurements = [] + utest_results = {} + # Careful, we may have different repetition count. + for i in range(min(len(partition[0]), len(partition[1]))): + bn = partition[0][i] + other_bench = partition[1][i] + measurements.append({ + 'real_time': bn['real_time'], + 'cpu_time': bn['cpu_time'], + 'real_time_other': other_bench['real_time'], + 'cpu_time_other': other_bench['cpu_time'], + 'time': calculate_change(bn['real_time'], other_bench['real_time']), + 'cpu': calculate_change(bn['cpu_time'], other_bench['cpu_time']) + }) + + # After processing the whole partition, if requested, do the U test. 
+ if utest: + timings_cpu = extract_field(partition, 'cpu_time') + timings_time = extract_field(partition, 'real_time') + have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest( + timings_cpu, timings_time) + if cpu_pvalue and time_pvalue: + utest_results = { + 'have_optimal_repetitions': have_optimal_repetitions, + 'cpu_pvalue': cpu_pvalue, + 'time_pvalue': time_pvalue, + 'nr_of_repetitions': len(timings_cpu[0]), + 'nr_of_repetitions_other': len(timings_cpu[1]) + } + + # Store only if we had any measurements for given benchmark. + # E.g. partition_benchmarks will filter out the benchmarks having + # time units which are not compatible with other time units in the + # benchmark suite. + if measurements: + run_type = partition[0][0]['run_type'] if 'run_type' in partition[0][0] else '' + aggregate_name = partition[0][0]['aggregate_name'] if run_type == 'aggregate' and 'aggregate_name' in partition[0][0] else '' + diff_report.append({ + 'name': benchmark_name, + 'label': label, + 'measurements': measurements, + 'time_unit': time_unit, + 'run_type': run_type, + 'aggregate_name': aggregate_name, + 'utest': utest_results + }) + + lhs_gmean = calculate_geomean(json1) + rhs_gmean = calculate_geomean(json2) + if lhs_gmean.any() and rhs_gmean.any(): + diff_report.append({ + 'name': 'OVERALL_GEOMEAN', + 'label': '', + 'measurements': [{ + 'real_time': lhs_gmean[0], + 'cpu_time': lhs_gmean[1], + 'real_time_other': rhs_gmean[0], + 'cpu_time_other': rhs_gmean[1], + 'time': calculate_change(lhs_gmean[0], rhs_gmean[0]), + 'cpu': calculate_change(lhs_gmean[1], rhs_gmean[1]) + }], + 'time_unit': 's', + 'run_type': 'aggregate', + 'aggregate_name': 'geomean', + 'utest': {} + }) + + return diff_report + + +def print_difference_report( + json_diff_report, + include_aggregates_only=False, utest=False, utest_alpha=0.05, use_color=True): @@ -219,14 +338,16 @@ def generate_difference_report( runs specified as 'json1' and 'json2'. 
""" assert utest is True or utest is False - first_col_width = find_longest_name(json1['benchmarks']) - def find_test(name): - for b in json2['benchmarks']: - if b['name'] == name: - return b - return None + def get_color(res): + if res > 0.05: + return BC_FAIL + elif res > -0.07: + return BC_WHITE + else: + return BC_CYAN + first_col_width = find_longest_name(json_diff_report) first_col_width = max( first_col_width, len('Benchmark')) @@ -235,50 +356,33 @@ def find_test(name): 'Benchmark', 12 + first_col_width) output_strs = [first_line, '-' * len(first_line)] - partitions = partition_benchmarks(json1, json2) - for partition in partitions: - # Careful, we may have different repetition count. - for i in range(min(len(partition[0]), len(partition[1]))): - bn = partition[0][i] - other_bench = partition[1][i] - - # *If* we were asked to only display aggregates, - # and if it is non-aggregate, then skip it. - if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench: - assert bn['run_type'] == other_bench['run_type'] - if bn['run_type'] != 'aggregate': - continue - - fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}" - - def get_color(res): - if res > 0.05: - return BC_FAIL - elif res > -0.07: - return BC_WHITE - else: - return BC_CYAN - - tres = calculate_change(bn['real_time'], other_bench['real_time']) - cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time']) - output_strs += [color_format(use_color, - fmt_str, - BC_HEADER, - bn['name'], - first_col_width, - get_color(tres), - tres, - get_color(cpures), - cpures, - bn['real_time'], - other_bench['real_time'], - bn['cpu_time'], - other_bench['cpu_time'], - endc=BC_ENDC)] - - # After processing the whole partition, if requested, do the U test. 
- if utest: - output_strs += print_utest(partition, + fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}" + for benchmark in json_diff_report: + # *If* we were asked to only include aggregates, + # and if it is non-aggregate, then don't print it. + if not include_aggregates_only or not 'run_type' in benchmark or benchmark['run_type'] == 'aggregate': + for measurement in benchmark['measurements']: + output_strs += [color_format(use_color, + fmt_str, + BC_HEADER, + benchmark['name'], + first_col_width, + get_color(measurement['time']), + measurement['time'], + get_color(measurement['cpu']), + measurement['cpu'], + measurement['real_time'], + measurement['real_time_other'], + measurement['cpu_time'], + measurement['cpu_time_other'], + endc=BC_ENDC)] + + # After processing the measurements, if requested and + # if applicable (e.g. u-test exists for given benchmark), + # print the U test. + if utest and benchmark['utest']: + output_strs += print_utest(benchmark['name'], + benchmark['utest'], utest_alpha=utest_alpha, first_col_width=first_col_width, use_color=use_color) @@ -319,21 +423,26 @@ def test_basic(self): class TestReportDifference(unittest.TestCase): - def load_results(self): - import json - testInputs = os.path.join( - os.path.dirname( - os.path.realpath(__file__)), - 'Inputs') - testOutput1 = os.path.join(testInputs, 'test1_run1.json') - testOutput2 = os.path.join(testInputs, 'test1_run2.json') - with open(testOutput1, 'r') as f: - json1 = json.load(f) - with open(testOutput2, 'r') as f: - json2 = json.load(f) - return json1, json2 - - def test_basic(self): + @classmethod + def setUpClass(cls): + def load_results(): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test1_run1.json') + testOutput2 = os.path.join(testInputs, 'test1_run2.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with 
open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + json1, json2 = load_results() + cls.json_diff_report = get_difference_report(json1, json2) + + def test_json_diff_report_pretty_printing(self): expect_lines = [ ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'], ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'], @@ -350,10 +459,11 @@ def test_basic(self): '-0.1000', '100', '110', '100', '90'], ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'], ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'], + ['BM_hasLabel', '+0.0000', '+0.0000', '1', '1', '1', '1'], + ['OVERALL_GEOMEAN', '-0.8113', '-0.7779', '0', '0', '0', '0'] ] - json1, json2 = self.load_results() - output_lines_with_header = generate_difference_report( - json1, json2, use_color=False) + output_lines_with_header = print_difference_report( + self.json_diff_report, use_color=False) output_lines = output_lines_with_header[2:] print("\n") print("\n".join(output_lines_with_header)) @@ -363,31 +473,175 @@ def test_basic(self): self.assertEqual(len(parts), 7) self.assertEqual(expect_lines[i], parts) + def test_json_diff_report_output(self): + expected_output = [ + { + 'name': 'BM_SameTimes', + 'label': '', + 'measurements': [{'time': 0.0000, 'cpu': 0.0000, + 'real_time': 10, 'real_time_other': 10, + 'cpu_time': 10, 'cpu_time_other': 10}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_2xFaster', + 'label': '', + 'measurements': [{'time': -0.5000, 'cpu': -0.5000, + 'real_time': 50, 'real_time_other': 25, + 'cpu_time': 50, 'cpu_time_other': 25}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_2xSlower', + 'label': '', + 'measurements': [{'time': 1.0000, 'cpu': 1.0000, + 'real_time': 50, 'real_time_other': 100, + 'cpu_time': 50, 'cpu_time_other': 100}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_1PercentFaster', + 'label': '', + 'measurements': [{'time': -0.0100, 'cpu': -0.0100, + 'real_time': 100, 
'real_time_other': 98.9999999, + 'cpu_time': 100, 'cpu_time_other': 98.9999999}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_1PercentSlower', + 'label': '', + 'measurements': [{'time': 0.0100, 'cpu': 0.0100, + 'real_time': 100, 'real_time_other': 101, + 'cpu_time': 100, 'cpu_time_other': 101}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_10PercentFaster', + 'label': '', + 'measurements': [{'time': -0.1000, 'cpu': -0.1000, + 'real_time': 100, 'real_time_other': 90, + 'cpu_time': 100, 'cpu_time_other': 90}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_10PercentSlower', + 'label': '', + 'measurements': [{'time': 0.1000, 'cpu': 0.1000, + 'real_time': 100, 'real_time_other': 110, + 'cpu_time': 100, 'cpu_time_other': 110}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_100xSlower', + 'label': '', + 'measurements': [{'time': 99.0000, 'cpu': 99.0000, + 'real_time': 100, 'real_time_other': 10000, + 'cpu_time': 100, 'cpu_time_other': 10000}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_100xFaster', + 'label': '', + 'measurements': [{'time': -0.9900, 'cpu': -0.9900, + 'real_time': 10000, 'real_time_other': 100, + 'cpu_time': 10000, 'cpu_time_other': 100}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_10PercentCPUToTime', + 'label': '', + 'measurements': [{'time': 0.1000, 'cpu': -0.1000, + 'real_time': 100, 'real_time_other': 110, + 'cpu_time': 100, 'cpu_time_other': 90}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_ThirdFaster', + 'label': '', + 'measurements': [{'time': -0.3333, 'cpu': -0.3334, + 'real_time': 100, 'real_time_other': 67, + 'cpu_time': 100, 'cpu_time_other': 67}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'BM_NotBadTimeUnit', + 'label': '', + 'measurements': [{'time': -0.9000, 'cpu': 0.2000, + 'real_time': 0.4, 'real_time_other': 0.04, + 'cpu_time': 0.5, 'cpu_time_other': 0.6}], + 'time_unit': 's', + 'utest': {} + }, + { + 'name': 'BM_hasLabel', + 
'label': 'a label', + 'measurements': [{'time': 0.0000, 'cpu': 0.0000, + 'real_time': 1, 'real_time_other': 1, + 'cpu_time': 1, 'cpu_time_other': 1}], + 'time_unit': 's', + 'utest': {} + }, + { + 'name': 'OVERALL_GEOMEAN', + 'label': '', + 'measurements': [{'real_time': 3.1622776601683826e-06, 'cpu_time': 3.2130844755623912e-06, + 'real_time_other': 1.9768988699420897e-07, 'cpu_time_other': 2.397447755209533e-07, + 'time': -0.8112976497120911, 'cpu': -0.7778551721181174}], + 'time_unit': 's', + 'run_type': 'aggregate', + 'aggregate_name': 'geomean', 'utest': {} + }, + ] + self.assertEqual(len(self.json_diff_report), len(expected_output)) + for out, expected in zip( + self.json_diff_report, expected_output): + self.assertEqual(out['name'], expected['name']) + self.assertEqual(out['label'], expected['label']) + self.assertEqual(out['time_unit'], expected['time_unit']) + assert_utest(self, out, expected) + assert_measurements(self, out, expected) + class TestReportDifferenceBetweenFamilies(unittest.TestCase): - def load_result(self): - import json - testInputs = os.path.join( - os.path.dirname( - os.path.realpath(__file__)), - 'Inputs') - testOutput = os.path.join(testInputs, 'test2_run.json') - with open(testOutput, 'r') as f: - json = json.load(f) - return json + @classmethod + def setUpClass(cls): + def load_result(): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput = os.path.join(testInputs, 'test2_run.json') + with open(testOutput, 'r') as f: + json = json.load(f) + return json + + json = load_result() + json1 = filter_benchmark(json, "BM_Z.ro", ".") + json2 = filter_benchmark(json, "BM_O.e", ".") + cls.json_diff_report = get_difference_report(json1, json2) - def test_basic(self): + def test_json_diff_report_pretty_printing(self): expect_lines = [ ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'], ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'], ['Prefix/.', '-0.5000', '-0.5000', '20', 
'10', '20', '10'], ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'], + ['OVERALL_GEOMEAN', '-0.5000', '-0.5000', '0', '0', '0', '0'] ] - json = self.load_result() - json1 = filter_benchmark(json, "BM_Z.ro", ".") - json2 = filter_benchmark(json, "BM_O.e", ".") - output_lines_with_header = generate_difference_report( - json1, json2, use_color=False) + output_lines_with_header = print_difference_report( + self.json_diff_report, use_color=False) output_lines = output_lines_with_header[2:] print("\n") print("\n".join(output_lines_with_header)) @@ -397,31 +651,81 @@ def test_basic(self): self.assertEqual(len(parts), 7) self.assertEqual(expect_lines[i], parts) + def test_json_diff_report(self): + expected_output = [ + { + 'name': u'.', + 'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 10, 'real_time_other': 5, 'cpu_time': 10, 'cpu_time_other': 5}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': u'./4', + 'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 40, 'real_time_other': 20, 'cpu_time': 40, 'cpu_time_other': 20}], + 'time_unit': 'ns', + 'utest': {}, + }, + { + 'name': u'Prefix/.', + 'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 20, 'real_time_other': 10, 'cpu_time': 20, 'cpu_time_other': 10}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': u'Prefix/./3', + 'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'OVERALL_GEOMEAN', + 'measurements': [{'real_time': 2.213363839400641e-08, 'cpu_time': 2.213363839400641e-08, + 'real_time_other': 1.1066819197003185e-08, 'cpu_time_other': 1.1066819197003185e-08, + 'time': -0.5000000000000009, 'cpu': -0.5000000000000009}], + 'time_unit': 's', + 'run_type': 'aggregate', + 'aggregate_name': 'geomean', + 'utest': {} + } + ] + self.assertEqual(len(self.json_diff_report), len(expected_output)) + for out, expected in zip( + self.json_diff_report, 
expected_output): + self.assertEqual(out['name'], expected['name']) + self.assertEqual(out['time_unit'], expected['time_unit']) + assert_utest(self, out, expected) + assert_measurements(self, out, expected) + class TestReportDifferenceWithUTest(unittest.TestCase): - def load_results(self): - import json - testInputs = os.path.join( - os.path.dirname( - os.path.realpath(__file__)), - 'Inputs') - testOutput1 = os.path.join(testInputs, 'test3_run0.json') - testOutput2 = os.path.join(testInputs, 'test3_run1.json') - with open(testOutput1, 'r') as f: - json1 = json.load(f) - with open(testOutput2, 'r') as f: - json2 = json.load(f) - return json1, json2 - - def test_utest(self): - expect_lines = [] + @classmethod + def setUpClass(cls): + def load_results(): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test3_run0.json') + testOutput2 = os.path.join(testInputs, 'test3_run1.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + json1, json2 = load_results() + cls.json_diff_report = get_difference_report( + json1, json2, utest=True) + + def test_json_diff_report_pretty_printing(self): expect_lines = [ ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'], ['BM_Two_pvalue', - '0.6985', - '0.6985', + '1.0000', + '0.6667', 'U', 'Test,', 'Repetitions:', @@ -438,7 +742,7 @@ def test_utest(self): ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], ['short_pvalue', '0.7671', - '0.1489', + '0.2000', 'U', 'Test,', 'Repetitions:', @@ -452,10 +756,10 @@ def test_utest(self): 'repetitions', 'recommended.'], ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'], + ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0'] ] - json1, json2 = self.load_results() - 
output_lines_with_header = generate_difference_report( - json1, json2, utest=True, utest_alpha=0.05, use_color=False) + output_lines_with_header = print_difference_report( + self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False) output_lines = output_lines_with_header[2:] print("\n") print("\n".join(output_lines_with_header)) @@ -464,32 +768,169 @@ def test_utest(self): parts = [x for x in output_lines[i].split(' ') if x] self.assertEqual(expect_lines[i], parts) + def test_json_diff_report_pretty_printing_aggregates_only(self): + expect_lines = [ + ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], + ['BM_Two_pvalue', + '1.0000', + '0.6667', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '2.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'], + ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], + ['short_pvalue', + '0.7671', + '0.2000', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '3.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0'] + ] + output_lines_with_header = print_difference_report( + self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(expect_lines[i], parts) + + def test_json_diff_report(self): + expected_output = [ + { + 'name': u'BM_One', + 'measurements': [ + {'time': -0.1, + 'cpu': 0.1, + 'real_time': 10, + 'real_time_other': 9, + 'cpu_time': 100, + 'cpu_time_other': 110} + ], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': u'BM_Two', + 'measurements': [ + {'time': 0.1111111111111111, + 'cpu': 
-0.011111111111111112, + 'real_time': 9, + 'real_time_other': 10, + 'cpu_time': 90, + 'cpu_time_other': 89}, + {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8, + 'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72} + ], + 'time_unit': 'ns', + 'utest': { + 'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0 + } + }, + { + 'name': u'short', + 'measurements': [ + {'time': -0.125, + 'cpu': -0.0625, + 'real_time': 8, + 'real_time_other': 7, + 'cpu_time': 80, + 'cpu_time_other': 75}, + {'time': -0.4325, + 'cpu': -0.13506493506493514, + 'real_time': 8, + 'real_time_other': 4.54, + 'cpu_time': 77, + 'cpu_time_other': 66.6} + ], + 'time_unit': 'ns', + 'utest': { + 'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772 + } + }, + { + 'name': u'medium', + 'measurements': [ + {'time': -0.375, + 'cpu': -0.3375, + 'real_time': 8, + 'real_time_other': 5, + 'cpu_time': 80, + 'cpu_time_other': 53} + ], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': 'OVERALL_GEOMEAN', + 'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08, + 'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08, + 'time': 1.6404861082353634, 'cpu': -0.6984640740519662}], + 'time_unit': 's', + 'run_type': 'aggregate', + 'aggregate_name': 'geomean', + 'utest': {} + } + ] + self.assertEqual(len(self.json_diff_report), len(expected_output)) + for out, expected in zip( + self.json_diff_report, expected_output): + self.assertEqual(out['name'], expected['name']) + self.assertEqual(out['time_unit'], expected['time_unit']) + assert_utest(self, out, expected) + assert_measurements(self, out, expected) + class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly( unittest.TestCase): - def load_results(self): - import json - testInputs = os.path.join( - os.path.dirname( - os.path.realpath(__file__)), - 'Inputs') - testOutput1 = os.path.join(testInputs, 
'test3_run0.json') - testOutput2 = os.path.join(testInputs, 'test3_run1.json') - with open(testOutput1, 'r') as f: - json1 = json.load(f) - with open(testOutput2, 'r') as f: - json2 = json.load(f) - return json1, json2 - - def test_utest(self): - expect_lines = [] + @classmethod + def setUpClass(cls): + def load_results(): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test3_run0.json') + testOutput2 = os.path.join(testInputs, 'test3_run1.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + json1, json2 = load_results() + cls.json_diff_report = get_difference_report( + json1, json2, utest=True) + + def test_json_diff_report_pretty_printing(self): expect_lines = [ ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'], ['BM_Two_pvalue', - '0.6985', - '0.6985', + '1.0000', + '0.6667', 'U', 'Test,', 'Repetitions:', @@ -506,7 +947,7 @@ def test_utest(self): ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], ['short_pvalue', '0.7671', - '0.1489', + '0.2000', 'U', 'Test,', 'Repetitions:', @@ -519,10 +960,136 @@ def test_utest(self): '9+', 'repetitions', 'recommended.'], + ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'], + ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0'] + ] + output_lines_with_header = print_difference_report( + self.json_diff_report, + utest=True, utest_alpha=0.05, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(expect_lines[i], parts) + + def test_json_diff_report(self): + 
expected_output = [ + { + 'name': u'BM_One', + 'measurements': [ + {'time': -0.1, + 'cpu': 0.1, + 'real_time': 10, + 'real_time_other': 9, + 'cpu_time': 100, + 'cpu_time_other': 110} + ], + 'time_unit': 'ns', + 'utest': {} + }, + { + 'name': u'BM_Two', + 'measurements': [ + {'time': 0.1111111111111111, + 'cpu': -0.011111111111111112, + 'real_time': 9, + 'real_time_other': 10, + 'cpu_time': 90, + 'cpu_time_other': 89}, + {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8, + 'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72} + ], + 'time_unit': 'ns', + 'utest': { + 'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0 + } + }, + { + 'name': u'short', + 'measurements': [ + {'time': -0.125, + 'cpu': -0.0625, + 'real_time': 8, + 'real_time_other': 7, + 'cpu_time': 80, + 'cpu_time_other': 75}, + {'time': -0.4325, + 'cpu': -0.13506493506493514, + 'real_time': 8, + 'real_time_other': 4.54, + 'cpu_time': 77, + 'cpu_time_other': 66.6} + ], + 'time_unit': 'ns', + 'utest': { + 'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772 + } + }, + { + 'name': u'medium', + 'measurements': [ + {'real_time_other': 5, + 'cpu_time': 80, + 'time': -0.375, + 'real_time': 8, + 'cpu_time_other': 53, + 'cpu': -0.3375 + } + ], + 'utest': {}, + 'time_unit': u'ns', + 'aggregate_name': '' + }, + { + 'name': 'OVERALL_GEOMEAN', + 'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08, + 'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08, + 'time': 1.6404861082353634, 'cpu': -0.6984640740519662}], + 'time_unit': 's', + 'run_type': 'aggregate', + 'aggregate_name': 'geomean', + 'utest': {} + } ] - json1, json2 = self.load_results() - output_lines_with_header = generate_difference_report( - json1, json2, display_aggregates_only=True, + self.assertEqual(len(self.json_diff_report), len(expected_output)) + for out, expected in zip( + self.json_diff_report, 
expected_output): + self.assertEqual(out['name'], expected['name']) + self.assertEqual(out['time_unit'], expected['time_unit']) + assert_utest(self, out, expected) + assert_measurements(self, out, expected) + + +class TestReportDifferenceForPercentageAggregates( + unittest.TestCase): + @classmethod + def setUpClass(cls): + def load_results(): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test4_run0.json') + testOutput2 = os.path.join(testInputs, 'test4_run1.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + json1, json2 = load_results() + cls.json_diff_report = get_difference_report( + json1, json2, utest=True) + + def test_json_diff_report_pretty_printing(self): + expect_lines = [ + ['whocares', '-0.5000', '+0.5000', '0', '0', '0', '0'] + ] + output_lines_with_header = print_difference_report( + self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False) output_lines = output_lines_with_header[2:] print("\n") @@ -532,6 +1099,99 @@ def test_utest(self): parts = [x for x in output_lines[i].split(' ') if x] self.assertEqual(expect_lines[i], parts) + def test_json_diff_report(self): + expected_output = [ + { + 'name': u'whocares', + 'measurements': [ + {'time': -0.5, + 'cpu': 0.5, + 'real_time': 0.01, + 'real_time_other': 0.005, + 'cpu_time': 0.10, + 'cpu_time_other': 0.15} + ], + 'time_unit': 'ns', + 'utest': {} + } + ] + self.assertEqual(len(self.json_diff_report), len(expected_output)) + for out, expected in zip( + self.json_diff_report, expected_output): + self.assertEqual(out['name'], expected['name']) + self.assertEqual(out['time_unit'], expected['time_unit']) + assert_utest(self, out, expected) + assert_measurements(self, out, expected) + + +class TestReportSorting(unittest.TestCase): + @classmethod + def setUpClass(cls): + def load_result(): + import 
json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput = os.path.join(testInputs, 'test4_run.json') + with open(testOutput, 'r') as f: + json = json.load(f) + return json + + cls.json = load_result() + + def test_json_diff_report_pretty_printing(self): + import util + + expected_names = [ + "99 family 0 instance 0 repetition 0", + "98 family 0 instance 0 repetition 1", + "97 family 0 instance 0 aggregate", + "96 family 0 instance 1 repetition 0", + "95 family 0 instance 1 repetition 1", + "94 family 0 instance 1 aggregate", + "93 family 1 instance 0 repetition 0", + "92 family 1 instance 0 repetition 1", + "91 family 1 instance 0 aggregate", + "90 family 1 instance 1 repetition 0", + "89 family 1 instance 1 repetition 1", + "88 family 1 instance 1 aggregate" + ] + + for n in range(len(self.json['benchmarks']) ** 2): + random.shuffle(self.json['benchmarks']) + sorted_benchmarks = util.sort_benchmark_results(self.json)[ + 'benchmarks'] + self.assertEqual(len(expected_names), len(sorted_benchmarks)) + for out, expected in zip(sorted_benchmarks, expected_names): + self.assertEqual(out['name'], expected) + + +def assert_utest(unittest_instance, lhs, rhs): + if lhs['utest']: + unittest_instance.assertAlmostEqual( + lhs['utest']['cpu_pvalue'], + rhs['utest']['cpu_pvalue']) + unittest_instance.assertAlmostEqual( + lhs['utest']['time_pvalue'], + rhs['utest']['time_pvalue']) + unittest_instance.assertEqual( + lhs['utest']['have_optimal_repetitions'], + rhs['utest']['have_optimal_repetitions']) + else: + # lhs is empty. assert if rhs is not. 
+ unittest_instance.assertEqual(lhs['utest'], rhs['utest']) + + +def assert_measurements(unittest_instance, lhs, rhs): + for m1, m2 in zip(lhs['measurements'], rhs['measurements']): + unittest_instance.assertEqual(m1['real_time'], m2['real_time']) + unittest_instance.assertEqual(m1['cpu_time'], m2['cpu_time']) + # m1['time'] and m1['cpu'] hold values which are being calculated, + # and therefore we must use almost-equal pattern. + unittest_instance.assertAlmostEqual(m1['time'], m2['time'], places=4) + unittest_instance.assertAlmostEqual(m1['cpu'], m2['cpu'], places=4) + if __name__ == '__main__': unittest.main() diff --git a/ThirdParty/googlebenchmark/tools/gbench/util.py b/ThirdParty/googlebenchmark/tools/gbench/util.py index 1f8e8e2c47..5e79da8f01 100644 --- a/ThirdParty/googlebenchmark/tools/gbench/util.py +++ b/ThirdParty/googlebenchmark/tools/gbench/util.py @@ -2,9 +2,11 @@ """ import json import os -import tempfile +import re import subprocess import sys +import tempfile + # Input file type enumeration IT_Invalid = 0 @@ -57,7 +59,7 @@ def classify_input_file(filename): """ Return a tuple (type, msg) where 'type' specifies the classified type of 'filename'. If 'type' is 'IT_Invalid' then 'msg' is a human readable - string represeting the error. + string representing the error. """ ftype = IT_Invalid err_msg = None @@ -110,13 +112,49 @@ def remove_benchmark_flags(prefix, benchmark_flags): return [f for f in benchmark_flags if not f.startswith(prefix)] -def load_benchmark_results(fname): +def load_benchmark_results(fname, benchmark_filter): """ Read benchmark output from a file and return the JSON object. + + Apply benchmark_filter, a regular expression, with nearly the same + semantics of the --benchmark_filter argument. May be None. + Note: the Python regular expression engine is used instead of the + one used by the C++ code, which may produce different results + in complex cases. + REQUIRES: 'fname' names a file containing JSON benchmark output. 
""" + def benchmark_wanted(benchmark): + if benchmark_filter is None: + return True + name = benchmark.get('run_name', None) or benchmark['name'] + if re.search(benchmark_filter, name): + return True + return False + with open(fname, 'r') as f: - return json.load(f) + results = json.load(f) + if 'benchmarks' in results: + results['benchmarks'] = list(filter(benchmark_wanted, + results['benchmarks'])) + return results + + +def sort_benchmark_results(result): + benchmarks = result['benchmarks'] + + # From inner key to the outer key! + benchmarks = sorted( + benchmarks, key=lambda benchmark: benchmark['repetition_index'] if 'repetition_index' in benchmark else -1) + benchmarks = sorted( + benchmarks, key=lambda benchmark: 1 if 'run_type' in benchmark and benchmark['run_type'] == "aggregate" else 0) + benchmarks = sorted( + benchmarks, key=lambda benchmark: benchmark['per_family_instance_index'] if 'per_family_instance_index' in benchmark else -1) + benchmarks = sorted( + benchmarks, key=lambda benchmark: benchmark['family_index'] if 'family_index' in benchmark else -1) + + result['benchmarks'] = benchmarks + return result def run_benchmark(exe_name, benchmark_flags): @@ -142,7 +180,7 @@ def run_benchmark(exe_name, benchmark_flags): if exitCode != 0: print('TEST FAILED...') sys.exit(exitCode) - json_res = load_benchmark_results(output_name) + json_res = load_benchmark_results(output_name, None) if is_temp_output: os.unlink(output_name) return json_res @@ -157,8 +195,9 @@ def run_or_load_benchmark(filename, benchmark_flags): """ ftype = check_input_file(filename) if ftype == IT_JSON: - return load_benchmark_results(filename) - elif ftype == IT_Executable: + benchmark_filter = find_benchmark_flag('--benchmark_filter=', + benchmark_flags) + return load_benchmark_results(filename, benchmark_filter) + if ftype == IT_Executable: return run_benchmark(filename, benchmark_flags) - else: - assert False # This branch is unreachable + raise ValueError('Unknown file type %s' % 
ftype) diff --git a/ThirdParty/googlebenchmark/tools/strip_asm.py b/ThirdParty/googlebenchmark/tools/strip_asm.py index 9030550b43..d131dc7194 100755 --- a/ThirdParty/googlebenchmark/tools/strip_asm.py +++ b/ThirdParty/googlebenchmark/tools/strip_asm.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ strip_asm.py - Cleanup ASM output for the specified file diff --git a/ThirdParty/licenses/index.md b/ThirdParty/licenses/index.md index dc542beb0d..0a3607a99a 100644 --- a/ThirdParty/licenses/index.md +++ b/ThirdParty/licenses/index.md @@ -37,6 +37,7 @@ zlib | [zlib]() | PNG support | glslang | [BSD BSD-like MIT](https://github.com/KhronosGroup/glslang/blob/master/LICENSE.txt) | Rendering support spirv-tools | [Apache-2.0](https://github.com/KhronosGroup/SPIRV-Tools/blob/master/LICENSE) | Rendering support spirv-cross | [Apache-2.0](https://github.com/KhronosGroup/SPIRV-Cross/blob/master/LICENSE) | Rendering support +abseil | [Apache-2.0](https://github.com/abseil/abseil-cpp/blob/master/LICENSE) | Rendering support Vulkan Loader | [Apache-2.0](https://github.com/KhronosGroup/Vulkan-Loader/blob/main/LICENSE.txt) | Rendering support Vulkan Memory Allocator | [MIT](https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator/blob/master/LICENSE.txt) | Rendering support GLM | [MIT](https://github.com/g-truc/glm/blob/master/copying.txt) | Rendering support @@ -163,5 +164,5 @@ libuv | [MIT]() | Distributed --- | --- | --- | STB | [MIT](https://github.com/nothings/stb/blob/master/LICENSE) | Image file utils --- | --- | --- | -| poly2tri | [BSD](https://github.com/greenm01/poly2tri/blob/master/LICENSE) | GDAL, Polygon triangulation | +| earcut | [ISC](https://github.com/mapbox/earcut.hpp/blob/master/LICENSE) | Omniverse Export Polygon Triangulation | | kdtree-cpp | [BSD-3-Clause](https://github.com/cdalitz/kdtree-cpp/blob/master/LICENSE) | Cross-Section 2D | diff --git a/ThirdParty/licenses/index.txt b/ThirdParty/licenses/index.txt index 
ed7f2e2a49..3bcf8c764a 100644 --- a/ThirdParty/licenses/index.txt +++ b/ThirdParty/licenses/index.txt @@ -37,6 +37,7 @@ glslang [BSD BSD-like MIT](https://github.com/KhronosGroup/glslang/blob/master/LICENSE.txt) Rendering support spirv-tools [Apache-2.0](https://github.com/KhronosGroup/SPIRV-Tools/blob/master/LICENSE) Rendering support spirv-cross [Apache-2.0](https://github.com/KhronosGroup/SPIRV-Cross/blob/master/LICENSE) Rendering support + abseil [Apache-2.0](https://github.com/abseil/abseil-cpp/blob/master/LICENSE) Rendering support Vulkan Loader [Apache-2.0](https://github.com/KhronosGroup/Vulkan-Loader/blob/main/LICENSE.txt) Rendering support Vulkan Memory Allocator [MIT](https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator/blob/master/LICENSE.txt) Rendering support GLM [MIT](https://github.com/g-truc/glm/blob/master/copying.txt) Rendering support @@ -163,5 +164,5 @@ --- --- --- STB [MIT](https://github.com/nothings/stb/blob/master/LICENSE) Image file utils --- --- --- - poly2tri [BSD](https://github.com/greenm01/poly2tri/blob/master/LICENSE) GDAL, Polygon triangulation + earcut [ISC](https://github.com/mapbox/earcut.hpp/blob/master/LICENSE) Omniverse Export Polygon Triangulation kdtree-cpp [BSD-3-Clause](https://github.com/cdalitz/kdtree-cpp/blob/master/LICENSE) Cross-Section 2D diff --git a/ThriftHandler/CommandLineOptions.cpp b/ThriftHandler/CommandLineOptions.cpp index f1e6df4623..e2ddc580e9 100644 --- a/ThriftHandler/CommandLineOptions.cpp +++ b/ThriftHandler/CommandLineOptions.cpp @@ -64,15 +64,20 @@ extern size_t g_parallel_top_min; extern size_t g_parallel_top_max; extern size_t g_streaming_topn_max; extern size_t g_estimator_failure_max_groupby_size; +extern double g_ndv_groups_estimator_multiplier; extern bool g_columnar_large_projections; extern size_t g_columnar_large_projections_threshold; extern bool g_enable_system_tables; extern bool g_allow_system_dashboard_update; extern bool g_allow_memory_status_log; extern bool 
g_enable_logs_system_tables; +extern bool g_enable_logs_system_tables_auto_refresh; +extern std::string g_logs_system_tables_refresh_interval; extern size_t g_logs_system_tables_max_files_count; extern bool g_uniform_request_ids_per_thrift_call; extern size_t g_gpu_code_cache_max_size_in_bytes; +extern bool g_use_cpu_mem_pool_for_output_buffers; +extern bool g_use_cpu_mem_pool_size_for_max_cpu_slab_size; #ifdef ENABLE_MEMKIND extern std::string g_pmem_path; @@ -424,8 +429,8 @@ void CommandLineOptions::fillOptions() { ->implicit_value(true), "Enable watchdog."); desc.add_options()("watchdog-max-projected-rows-per-device", - po::value(&watchdog_max_projected_rows_per_device) - ->default_value(watchdog_max_projected_rows_per_device), + po::value(&g_watchdog_max_projected_rows_per_device) + ->default_value(g_watchdog_max_projected_rows_per_device), "Max number of rows allowed to be projected when running a query " "with watchdog enabled."); desc.add_options()( @@ -433,11 +438,12 @@ void CommandLineOptions::fillOptions() { po::value(&preflight_count_query_threshold) ->default_value(preflight_count_query_threshold), "Threshold to run pre-flight count query which computes # output rows accurately."); - desc.add_options()("watchdog-none-encoded-string-translation-limit", - po::value(&watchdog_none_encoded_string_translation_limit) - ->default_value(watchdog_none_encoded_string_translation_limit), - "Max number of none-encoded strings allowed to be translated " - "to dictionary-encoded with watchdog enabled"); + desc.add_options()( + "watchdog-none-encoded-string-translation-limit", + po::value(&g_watchdog_none_encoded_string_translation_limit) + ->default_value(g_watchdog_none_encoded_string_translation_limit), + "Max number of none-encoded strings allowed to be translated " + "to dictionary-encoded with watchdog enabled"); desc.add_options()("filter-push-down-low-frac", po::value(&g_filter_push_down_low_frac) ->default_value(g_filter_push_down_low_frac) @@ -469,6 +475,26 
@@ void CommandLineOptions::fillOptions() { ->default_value(system_parameters.gpu_input_mem_limit), "Force query to CPU when input data memory usage exceeds this " "percentage of available GPU memory."); + desc.add_options()("watchdog-in-clause-max-num-elem-non-bitmap", + po::value(&g_watchdog_in_clause_max_num_elem_non_bitmap) + ->default_value(g_watchdog_in_clause_max_num_elem_non_bitmap), + "Max number of unique values allowed to process IN-clause without " + "using a bitmap when watchdog is enabled."); + desc.add_options()("watchdog-in-clause-max-num-elem-bitmap", + po::value(&g_watchdog_in_clause_max_num_elem_bitmap) + ->default_value(g_watchdog_in_clause_max_num_elem_bitmap), + "Max number of unique values allowed to " + "process IN-clause using a bitmap when watchdog is enabled."); + desc.add_options()( + "watchdog-in-clause-max-num-input-rows", + po::value(&g_watchdog_in_clause_max_num_input_rows) + ->default_value(g_watchdog_in_clause_max_num_input_rows), + "Max number of input rows allowed to process IN-clause when watchdog is enabled"); + desc.add_options()("in-clause-num-elem-skip-bitmap", + po::value(&g_in_clause_num_elem_skip_bitmap) + ->default_value(g_in_clause_num_elem_skip_bitmap), + "# values to skip constructing a bitmap to process IN-clause"); + desc.add_options()( "hll-precision-bits", po::value(&g_hll_precision_bits) @@ -718,6 +744,16 @@ void CommandLineOptions::fillOptions() { ->default_value(g_enable_logs_system_tables) ->implicit_value(true), "Enable use of logs system tables."); + desc.add_options()("enable-logs-system-tables-auto-refresh", + po::value(&g_enable_logs_system_tables_auto_refresh) + ->default_value(g_enable_logs_system_tables_auto_refresh) + ->implicit_value(true), + "Enable automatic refreshes of logs system tables."); + desc.add_options()("logs-system-tables-refresh-interval", + po::value(&g_logs_system_tables_refresh_interval) + ->default_value(g_logs_system_tables_refresh_interval), + "Refresh interval for logs system 
tables. Interval should have the " + "following format: nS, nH, or nD"); desc.add_options()( "logs-system-tables-max-files-count", po::value(&g_logs_system_tables_max_files_count) @@ -783,6 +819,16 @@ void CommandLineOptions::fillDeveloperOptions() { ->default_value(g_enable_smem_group_by) ->implicit_value(true), "Enable using GPU shared memory for some GROUP BY queries."); + desc.add_options()( + "use-cpu-mem-pool-for-output-buffers", + po::value(&g_use_cpu_mem_pool_for_output_buffers) + ->default_value(g_use_cpu_mem_pool_for_output_buffers) + ->implicit_value(true), + "Use the CPU memory buffer pool (whose capacity is determined by the " + "cpu-buffer-mem-bytes configuration parameter) for output buffer allocations. " + "When this configuration parameter is set to false, output (e.g. result set) " + "buffer allocations will use heap memory outside the cpu-buffer-mem-bytes based " + "memory buffer pool."); desc.add_options()("num-executors", po::value(&system_parameters.num_executors) ->default_value(system_parameters.num_executors), @@ -924,6 +970,13 @@ void CommandLineOptions::fillDeveloperOptions() { "there is not enough free memory to accomodate the target slab size, smaller " "slabs will be allocated, down to the minimum size specified by " "min-cpu-slab-size."); + desc.add_options()("default-cpu-slab-size", + po::value(&system_parameters.default_cpu_slab_size) + ->default_value(system_parameters.default_cpu_slab_size), + "Default CPU buffer pool slab size (size of memory allocations). 
" + "Note that allocations above this size are allowed up to the size " + "specified by max-cpu-slab-size."); + desc.add_options()("min-gpu-slab-size", po::value(&system_parameters.min_gpu_slab_size) ->default_value(system_parameters.min_gpu_slab_size), @@ -936,6 +989,12 @@ void CommandLineOptions::fillDeveloperOptions() { "there is not enough free memory to accomodate the target slab size, smaller " "slabs will be allocated, down to the minimum size speified by " "min-gpu-slab-size."); + desc.add_options()("default-gpu-slab-size", + po::value(&system_parameters.default_gpu_slab_size) + ->default_value(system_parameters.default_gpu_slab_size), + "Default GPU buffer pool slab size (size of memory allocations). " + "Note that allocations above this size are allowed up to the size " + "specified by max-gpu-slab-size."); desc.add_options()( "max-output-projection-allocation-bytes", @@ -1133,6 +1192,12 @@ void CommandLineOptions::fillDeveloperOptions() { ->default_value(g_estimator_failure_max_groupby_size), "Maximum size of the groupby buffer if the estimator fails. By default we use the " "number of tuples in the table up to this value."); + desc.add_options()("ndv-group-estimator-multiplier", + po::value(&g_ndv_groups_estimator_multiplier) + ->default_value(g_ndv_groups_estimator_multiplier), + "A non-negative threshold to control the result of ndv group " + "estimator (default: 2.0). 
The value must be between 1.0 and 2.0"); + + desc.add_options()("columnar-large-projections", po::value(&g_columnar_large_projections) ->default_value(g_columnar_large_projections) @@ -1590,6 +1655,25 @@ void CommandLineOptions::validate() { LOG(INFO) << "Enable FSI is set to " << g_enable_fsi; LOG(INFO) << "Enable logs system tables set to " << g_enable_logs_system_tables; + if (g_enable_foreign_table_scheduled_refresh) { + LOG(INFO) << "Enable logs system tables auto refresh set to " + << g_enable_logs_system_tables_auto_refresh; + } else { + g_enable_logs_system_tables_auto_refresh = false; + LOG(INFO) << "Logs system tables auto refresh has been disabled as a side effect of " + "disabling foreign table scheduled refresh"; + } + + static const boost::regex interval_regex{"^\\d{1,}[SHD]$", + boost::regex::extended | boost::regex::icase}; + if (!boost::regex_match(g_logs_system_tables_refresh_interval, interval_regex)) { + throw std::runtime_error{ + "Invalid interval value provided for the \"logs-system-tables-refresh-interval\" " + "option. Interval should have the following format: nS, nH, or nD"}; + } + LOG(INFO) << "Logs system tables refresh interval set to " + << g_logs_system_tables_refresh_interval; + if (g_logs_system_tables_max_files_count == 0) { throw std::runtime_error{ "Invalid value provided for the \"logs-system-tables-max-files-count\" " @@ -1612,6 +1696,56 @@ void CommandLineOptions::validate() { } } #endif + + if (g_ndv_groups_estimator_multiplier < 1.0 || + g_ndv_groups_estimator_multiplier > 2.0) { + throw std::runtime_error( + "Invalid value provided for the \"ndv-group-estimator-multiplier\" option. " + "Value must be between 1.0 and 2.0"); + } + + // Check for the g_use_cpu_mem_pool_size_for_max_cpu_slab_size flag, since DataMgr + // ensures that min_cpu_slab_size cannot be greater than the buffer pool size. 
+ if (!g_use_cpu_mem_pool_size_for_max_cpu_slab_size && + system_parameters.max_cpu_slab_size < system_parameters.min_cpu_slab_size) { + throw std::runtime_error("max-cpu-slab-size (" + + std::to_string(system_parameters.max_cpu_slab_size) + + ") cannot be less than min-cpu-slab-size (" + + std::to_string(system_parameters.min_cpu_slab_size) + ")."); + } + if (system_parameters.default_cpu_slab_size < system_parameters.min_cpu_slab_size) { + throw std::runtime_error("default-cpu-slab-size (" + + std::to_string(system_parameters.default_cpu_slab_size) + + ") cannot be less than min-cpu-slab-size (" + + std::to_string(system_parameters.min_cpu_slab_size) + ")."); + } + // Check for the g_use_cpu_mem_pool_size_for_max_cpu_slab_size flag, since DataMgr + // ensures that default_cpu_slab_size cannot be greater than the buffer pool size. + if (!g_use_cpu_mem_pool_size_for_max_cpu_slab_size && + system_parameters.default_cpu_slab_size > system_parameters.max_cpu_slab_size) { + throw std::runtime_error("default-cpu-slab-size (" + + std::to_string(system_parameters.default_cpu_slab_size) + + ") cannot be greater than max-cpu-slab-size (" + + std::to_string(system_parameters.max_cpu_slab_size) + ")."); + } + if (system_parameters.max_gpu_slab_size < system_parameters.min_gpu_slab_size) { + throw std::runtime_error("max-gpu-slab-size (" + + std::to_string(system_parameters.max_gpu_slab_size) + + ") cannot be less than min-gpu-slab-size (" + + std::to_string(system_parameters.min_gpu_slab_size) + ")."); + } + if (system_parameters.default_gpu_slab_size < system_parameters.min_gpu_slab_size) { + throw std::runtime_error("default-gpu-slab-size (" + + std::to_string(system_parameters.default_gpu_slab_size) + + ") cannot be less than min-gpu-slab-size (" + + std::to_string(system_parameters.min_gpu_slab_size) + ")."); + } + if (system_parameters.default_gpu_slab_size > system_parameters.max_gpu_slab_size) { + throw std::runtime_error("default-gpu-slab-size (" + + 
std::to_string(system_parameters.default_gpu_slab_size) + + ") cannot be greater than max-gpu-slab-size (" + + std::to_string(system_parameters.max_gpu_slab_size) + ")."); + } } SystemParameters::RuntimeUdfRegistrationPolicy construct_runtime_udf_registration_policy( @@ -1763,8 +1897,6 @@ boost::optional CommandLineOptions::parse_command_line( } g_enable_watchdog = enable_watchdog; - g_watchdog_none_encoded_string_translation_limit = - watchdog_none_encoded_string_translation_limit; g_watchdog_max_projected_rows_per_device = watchdog_max_projected_rows_per_device; g_preflight_count_query_threshold = preflight_count_query_threshold; g_enable_dynamic_watchdog = enable_dynamic_watchdog; @@ -1892,14 +2024,47 @@ boost::optional CommandLineOptions::parse_command_line( if (system_parameters.cuda_grid_size) { LOG(INFO) << " cuda grid size " << system_parameters.cuda_grid_size; } + + if (g_use_cpu_mem_pool_for_output_buffers) { + if (vm["max-cpu-slab-size"].defaulted()) { + LOG(INFO) + << "max-cpu-slab-size is not set while use-cpu-mem-pool-for-output-buffers is " + "true. Using the CPU memory buffer pool size for the max CPU slab size."; + g_use_cpu_mem_pool_size_for_max_cpu_slab_size = true; + } + } else { + if (!vm["max-cpu-slab-size"].defaulted() && vm["default-cpu-slab-size"].defaulted()) { + LOG(INFO) + << "default-cpu-slab-size is not set while max-cpu-slab-size is set. " + "Setting default-cpu-slab-size to the same value as max-cpu-slab-size (" + << system_parameters.max_cpu_slab_size << " bytes)"; + system_parameters.default_cpu_slab_size = system_parameters.max_cpu_slab_size; + } + } + + if (!vm["max-gpu-slab-size"].defaulted() && vm["default-gpu-slab-size"].defaulted()) { + LOG(INFO) << "default-gpu-slab-size is not set while max-gpu-slab-size is set. 
" + "Setting default-gpu-slab-size to the same value as max-gpu-slab-size (" + << system_parameters.max_gpu_slab_size << " bytes)"; + system_parameters.default_gpu_slab_size = system_parameters.max_gpu_slab_size; + } + LOG(INFO) << " Min CPU buffer pool slab size (in bytes) " << system_parameters.min_cpu_slab_size; - LOG(INFO) << " Max CPU buffer pool slab size (in bytes) " - << system_parameters.max_cpu_slab_size; + if (g_use_cpu_mem_pool_size_for_max_cpu_slab_size) { + LOG(INFO) << " Max CPU buffer pool slab size is set to the CPU buffer pool size"; + } else { + LOG(INFO) << " Max CPU buffer pool slab size (in bytes) " + << system_parameters.max_cpu_slab_size; + } + LOG(INFO) << " Default CPU buffer pool slab size (in bytes) " + << system_parameters.default_cpu_slab_size; LOG(INFO) << " Min GPU buffer pool slab size (in bytes) " << system_parameters.min_gpu_slab_size; LOG(INFO) << " Max GPU buffer pool slab size (in bytes) " << system_parameters.max_gpu_slab_size; + LOG(INFO) << " Default GPU buffer pool slab size (in bytes) " + << system_parameters.default_gpu_slab_size; LOG(INFO) << " calcite JVM max memory (in MB) " << system_parameters.calcite_max_mem; LOG(INFO) << " HeavyDB Server Port " << system_parameters.omnisci_server_port; LOG(INFO) << " HeavyDB Calcite Port " << system_parameters.calcite_port; @@ -1941,6 +2106,11 @@ boost::optional CommandLineOptions::parse_command_line( LOG(INFO) << " \t Use chunk metadata cache: " << (g_use_chunk_metadata_cache ? "enabled" : "disabled"); } + LOG(INFO) << "Number of executors is set to " << system_parameters.num_executors; + + LOG(INFO) << "Use CPU memory pool for output buffers is set to " + << g_use_cpu_mem_pool_for_output_buffers; + LOG(INFO) << "Executor Resource Manager: " << (g_enable_executor_resource_mgr ? 
"enabled" : "disabled"); if (g_enable_executor_resource_mgr) { diff --git a/ThriftHandler/CommandLineOptions.h b/ThriftHandler/CommandLineOptions.h index 7626f2d895..c75738e4bf 100644 --- a/ThriftHandler/CommandLineOptions.h +++ b/ThriftHandler/CommandLineOptions.h @@ -39,6 +39,10 @@ class LeafHostInfo; extern size_t g_watchdog_max_projected_rows_per_device; extern size_t g_preflight_count_query_threshold; +extern size_t g_watchdog_in_clause_max_num_elem_non_bitmap; +extern size_t g_watchdog_in_clause_max_num_elem_bitmap; +extern size_t g_watchdog_in_clause_max_num_input_rows; +extern size_t g_in_clause_num_elem_skip_bitmap; class CommandLineOptions { public: @@ -177,6 +181,7 @@ extern unsigned g_dynamic_watchdog_time_limit; extern unsigned g_trivial_loop_join_threshold; extern size_t g_watchdog_none_encoded_string_translation_limit; extern size_t g_watchdog_max_projected_rows_per_device; +extern size_t g_watchdog_in_subquery_max_in_values; extern size_t g_preflight_count_query_threshold; extern bool g_from_table_reordering; extern bool g_enable_filter_push_down; diff --git a/ThriftHandler/DBHandler.cpp b/ThriftHandler/DBHandler.cpp index 4e37aa0759..50972c7572 100644 --- a/ThriftHandler/DBHandler.cpp +++ b/ThriftHandler/DBHandler.cpp @@ -4042,6 +4042,7 @@ import_export::CopyParams DBHandler::thrift_to_copyparams(const TCopyParams& cp) copy_params.add_metadata_columns = cp.add_metadata_columns; copy_params.trim_spaces = cp.trim_spaces; copy_params.geo_validate_geometry = cp.geo_validate_geometry; + copy_params.raster_drop_if_all_null = cp.raster_drop_if_all_null; return copy_params; } @@ -4174,6 +4175,7 @@ TCopyParams DBHandler::copyparams_to_thrift(const import_export::CopyParams& cp) copy_params.add_metadata_columns = cp.add_metadata_columns; copy_params.trim_spaces = cp.trim_spaces; copy_params.geo_validate_geometry = cp.geo_validate_geometry; + copy_params.raster_drop_if_all_null = cp.raster_drop_if_all_null; return copy_params; } @@ -6463,10 +6465,19 @@ void 
DBHandler::sql_execute_impl(ExecutionResult& _return, std::ostringstream oss; oss << query_substr << post_fix; auto const reduced_query_str = oss.str(); - log_system_cpu_memory_status("Start query execution: " + reduced_query_str, cat); - ScopeGuard cpu_system_memory_logging = [&cat, &reduced_query_str]() { - log_system_cpu_memory_status("Finish query execution: " + reduced_query_str, cat); + bool show_cpu_memory_stat_after_finishing_query = false; + ScopeGuard cpu_system_memory_logging = [&show_cpu_memory_stat_after_finishing_query, + &cat, + &reduced_query_str]() { + if (show_cpu_memory_stat_after_finishing_query) { + log_system_cpu_memory_status("Finish query execution: " + reduced_query_str, cat); + } }; + auto log_cpu_memory_status = + [&reduced_query_str, &cat, &show_cpu_memory_stat_after_finishing_query]() { + log_system_cpu_memory_status("Start query execution: " + reduced_query_str, cat); + show_cpu_memory_stat_after_finishing_query = true; + }; // test to see if db/catalog is writable before execution of a writable SQL/DDL command // TODO: move to execute() (?) 
@@ -6493,6 +6504,7 @@ void DBHandler::sql_execute_impl(ExecutionResult& _return, CHECK(ddl_query.HasMember("payload")); CHECK(ddl_query["payload"].IsObject()); auto stmt = Parser::InsertIntoTableAsSelectStmt(ddl_query["payload"].GetObject()); + log_cpu_memory_status(); _return.addExecutionTime( measure<>::execution([&]() { stmt.execute(*session_ptr, read_only_); })); return; @@ -6514,6 +6526,7 @@ void DBHandler::sql_execute_impl(ExecutionResult& _return, CHECK(ddl_query.HasMember("payload")); CHECK(ddl_query["payload"].IsObject()); auto stmt = Parser::CreateTableAsSelectStmt(ddl_query["payload"].GetObject()); + log_cpu_memory_status(); _return.addExecutionTime( measure<>::execution([&]() { stmt.execute(*session_ptr, read_only_); })); } @@ -6533,6 +6546,9 @@ void DBHandler::sql_execute_impl(ExecutionResult& _return, CHECK(ddl_query.HasMember("payload")); CHECK(ddl_query["payload"].IsObject()); auto stmt = Parser::InsertValuesStmt(cat, ddl_query["payload"].GetObject()); + if (stmt.get_value_lists().size() > 1) { + log_cpu_memory_status(); + } _return.addExecutionTime( measure<>::execution([&]() { stmt.execute(*session_ptr, read_only_); })); return; @@ -6586,6 +6602,7 @@ void DBHandler::sql_execute_impl(ExecutionResult& _return, _return.addExecutionTime(measure<>::execution( [&]() { execute_distributed_copy_statement(import_stmt, *session_ptr); })); } else { + log_cpu_memory_status(); _return.addExecutionTime(measure<>::execution( [&]() { import_stmt->execute(*session_ptr, read_only_); })); } @@ -6755,7 +6772,7 @@ void DBHandler::sql_execute_impl(ExecutionResult& _return, executor->checkPendingQueryStatus(query_session); } catch (QueryExecutionError& e) { executor->clearQuerySessionStatus(query_session, submitted_time_str); - if (e.getErrorCode() == Executor::ERR_INTERRUPTED) { + if (e.hasErrorCode(ErrorCode::INTERRUPTED)) { throw std::runtime_error( "Query execution has been interrupted (pending query)."); } @@ -6764,6 +6781,7 @@ void 
DBHandler::sql_execute_impl(ExecutionResult& _return, std::this_thread::sleep_for(std::chrono::milliseconds(10)); } } + log_cpu_memory_status(); dispatch_queue_->submit(execute_rel_alg_task, pw.getDMLType() == ParserWrapper::DMLType::Update || pw.getDMLType() == ParserWrapper::DMLType::Delete); diff --git a/Utils/StringLike.cpp b/Utils/StringLike.cpp index 449c34cb90..a3ee12fe83 100644 --- a/Utils/StringLike.cpp +++ b/Utils/StringLike.cpp @@ -38,10 +38,13 @@ DEVICE static int inline lowercase(char c) { return c; } +// escape_char does nothing, it's a placeholder to fit # arguments for both +// string_like and string_like_simple functions extern "C" RUNTIME_EXPORT DEVICE bool string_like_simple(const char* str, const int32_t str_len, const char* pattern, - const int32_t pat_len) { + const int32_t pat_len, + char escape_char) { int i, j; int search_len = str_len - pat_len + 1; for (i = 0; i < search_len; ++i) { @@ -54,10 +57,12 @@ extern "C" RUNTIME_EXPORT DEVICE bool string_like_simple(const char* str, return false; } +// escape_char does nothing and it is intentional as describe above extern "C" RUNTIME_EXPORT DEVICE bool string_ilike_simple(const char* str, const int32_t str_len, const char* pattern, - const int32_t pat_len) { + const int32_t pat_len, + char escape_char) { int i, j; int search_len = str_len - pat_len + 1; for (i = 0; i < search_len; ++i) { @@ -75,11 +80,12 @@ extern "C" RUNTIME_EXPORT DEVICE bool string_ilike_simple(const char* str, const int32_t lhs_len, \ const char* rhs, \ const int32_t rhs_len, \ + char escape_char, \ const int8_t bool_null) { \ if (!lhs || !rhs) { \ return bool_null; \ } \ - return base_func(lhs, lhs_len, rhs, rhs_len) ? 1 : 0; \ + return base_func(lhs, lhs_len, rhs, rhs_len, escape_char) ? 
1 : 0; \ } STR_LIKE_SIMPLE_NULLABLE(string_like_simple) diff --git a/Utils/StringLike.h b/Utils/StringLike.h index ffa231cf1f..9342d70f5d 100644 --- a/Utils/StringLike.h +++ b/Utils/StringLike.h @@ -55,12 +55,14 @@ extern "C" RUNTIME_EXPORT DEVICE bool string_ilike(const char* str, extern "C" RUNTIME_EXPORT DEVICE bool string_like_simple(const char* str, const int32_t str_len, const char* pattern, - const int32_t pat_len); + const int32_t pat_len, + char escape_char); extern "C" RUNTIME_EXPORT DEVICE bool string_ilike_simple(const char* str, const int32_t str_len, const char* pattern, - const int32_t pat_len); + const int32_t pat_len, + char escape_char); extern "C" RUNTIME_EXPORT DEVICE bool string_lt(const char* lhs, const int32_t lhs_len, diff --git a/config/asan.suppressions b/config/asan.suppressions index f7163e4abe..cefa54f3a3 100644 --- a/config/asan.suppressions +++ b/config/asan.suppressions @@ -6,3 +6,7 @@ leak:daal::algorithms::engines::mt2203::interface1::Batch leak:daal::algorithms::engines::mt2203::interface1::BatchContainer leak:daal::algorithms::interface1::AlgorithmDispatchContainer leak:daal::algorithms::interface1::Argument::Argument +# QE-1008 applies to next 3 lines +leak:_GLOBAL__sub_I_common.cpp +leak:_GLOBAL__sub_I_register_serializable.cpp +leak:oneapi::dal::detail::serializable_registry::register_default_factory diff --git a/config/valgrind.suppressions b/config/valgrind.suppressions index cf94314430..5a4d4e2f37 100644 --- a/config/valgrind.suppressions +++ b/config/valgrind.suppressions @@ -59,3 +59,14 @@ ... fun:runtime.* } +{ + # QE-1008 + onedal_api_global_common + Memcheck:Leak + match-leak-kinds: definite + fun:_Znwm + ... 
+ fun:_GLOBAL__sub_I_common.cpp + fun:__libc_csu_init + fun:(below main) +} diff --git a/heavy.thrift b/heavy.thrift index c64704fc45..59f715a4b8 100644 --- a/heavy.thrift +++ b/heavy.thrift @@ -251,6 +251,7 @@ struct TCopyParams { 40: string add_metadata_columns; 41: bool trim_spaces=true; 42: bool geo_validate_geometry=false; + 43: bool raster_drop_if_all_null=false; } struct TCreateParams { diff --git a/java/calcite/src/main/codegen/config.fmpp b/java/calcite/src/main/codegen/config.fmpp index 76042cbeb0..2f1772b2a1 100644 --- a/java/calcite/src/main/codegen/config.fmpp +++ b/java/calcite/src/main/codegen/config.fmpp @@ -569,8 +569,9 @@ data: { ] # List of methods for parsing custom SQL statements. - # Note that DROP commands are handled specially by SqlCustomDrop - # and SHOW commands are handled specially by SqlCustomShow because + # Note that CREATE commands are handled by SqlCustomCreate, + # DROP commands are handled by SqlCustomDrop and SHOW + # commands are handled specially by SqlCustomShow because # all of the statements listed here have LOOKAHEAD(2) which is # hardcoded in Calcite's Parser.jj source code. statementParserMethods: [ @@ -584,6 +585,7 @@ data: { "SqlRenameTable(span())" "SqlInsertIntoTable(span())" "SqlKillQuery(span())" + "SqlCustomCreate(span())" "SqlCustomDrop(span())" "SqlCustomShow(span())" "SqlGrant(span())" @@ -625,18 +627,16 @@ data: { # List of methods for parsing extensions to "CREATE [OR REPLACE]" calls. # Each must accept arguments "(SqlParserPos pos, boolean replace)". + # ---------------------------------------------------- + # No longer used in cases where REPLACE is unsupported. + # + # This broke away from the default Calcite implementation because we do + # not allow the optional "OR REPLACE" clause which is allowed by + # Calcite's default implementation of create. 
+ # See: SqlCustomCreate() + # ---------------------------------------------------- createStatementParserMethods: [ - "SqlCreateDataframe" - "SqlCreateDB" - "SqlCreateServer" - "SqlCreateForeignTable" - "SqlCreateUserMapping" - "SqlCreateTable" - "SqlCreateUser" - "SqlCreateView" "SqlCreateModel" - "SqlCreateRole" - "SqlCreatePolicy" ] # List of methods for parsing extensions to "DROP" calls. diff --git a/java/calcite/src/main/codegen/includes/ddlParser.ftl b/java/calcite/src/main/codegen/includes/ddlParser.ftl index 784eed5b7f..1e9141b3f8 100644 --- a/java/calcite/src/main/codegen/includes/ddlParser.ftl +++ b/java/calcite/src/main/codegen/includes/ddlParser.ftl @@ -569,7 +569,7 @@ boolean TemporaryOpt() : * * CREATE TABLE [ IF NOT EXISTS ] AS