Templatization of fixed-width parquet decoding kernels. (rapidsai#15911)

This PR merges all of the fixed-width parquet decoding kernels into a single templatized kernel that can be selectively instantiated with desired features (dictionary/no-dictionary, nested/non-nested, etc). It also adds support for (non-list) nested columns in this path. So structs do not have to use the much slower general decode kernel any more. A new benchmark was added specific to structs containing only fixed width columns. I added this because the performance improvement is fairly high (+20%) but we don't see it in the normal struct benchmarks because they include (and are dominated by) string decode times. The new benchmark shows: Before this PR: ``` | data_type | io_type | cardinality | run_length | bytes_per_second | peak_memory_usage | encoded_file_size | |-----------|---------------|-------------|------------|------------------|-------------------|-------------------| | STRUCT | DEVICE_BUFFER | 0 | 1 | 21071216823 | 1.047 GiB | 511.675 MiB | | STRUCT | DEVICE_BUFFER | 1000 | 1 | 18974392387 | 821.312 MiB | 128.884 MiB | | STRUCT | DEVICE_BUFFER | 0 | 32 | 20429356824 | 621.787 MiB | 28.141 MiB | | STRUCT | DEVICE_BUFFER | 1000 | 32 | 20572327813 | 598.421 MiB | 16.475 MiB | ``` After this PR: ``` | data_type | io_type | cardinality | run_length | bytes_per_second | peak_memory_usage | encoded_file_size | |-----------|---------------|-------------|------------|------------------|-------------------|-------------------| | STRUCT | DEVICE_BUFFER | 0 | 1 | 25805996399 | 1.047 GiB | 511.675 MiB | | STRUCT | DEVICE_BUFFER | 1000 | 1 | 22422306660 | 821.312 MiB | 128.884 MiB | | STRUCT | DEVICE_BUFFER | 0 | 32 | 24460694014 | 621.787 MiB | 28.141 MiB | | STRUCT | DEVICE_BUFFER | 1000 | 32 | 24674861214 | 598.421 MiB | 16.475 MiB | ``` Split-page decoding for fixed-width types + structs are also going through this new path. New test added. This brings us closer to eliminating the "general" kernel. The only things left that run through it are lists and booleans. This is PR 1 of 2, with the followup moving a lot of code around. At this point, I think it makes sense to start consolidating our files a bit. I also left some breadcrumbs (a few small commented out code blocks) in the core kernel `gpuDecodePageDataGeneric` for the next step of adding list support. They can be removed if people don't like them. Authors: - https://github.com/nvdbaranec Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: rapidsai#15911
lithomas1 · Jun 28, 2024 · a4b951a · a4b951a
1 parent e434fdb
commit a4b951a
Show file tree

Hide file tree

Showing 6 changed files with 703 additions and 459 deletions.
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,20 +59,18 @@ void parquet_read_common(cudf::size_type num_rows_to_read,
 }
 
 template <data_type DataType>
-void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
+void BM_parquet_read_data_common(nvbench::state& state,
+                                 data_profile const& profile,
+                                 nvbench::type_list<nvbench::enum_type<DataType>>)
 {
   auto const d_type      = get_type_or_group(static_cast<int32_t>(DataType));
-  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
-  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
   auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
   auto const compression = cudf::io::compression_type::SNAPPY;
   cuio_source_sink_pair source_sink(source_type);
 
   auto const num_rows_written = [&]() {
-    auto const tbl = create_random_table(
-      cycle_dtypes(d_type, num_cols),
-      table_size_bytes{data_size},
-      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+    auto const tbl =
+      create_random_table(cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile);
     auto const view = tbl->view();
 
     cudf::io::parquet_writer_options write_opts =
@@ -85,6 +83,32 @@ void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enu
   parquet_read_common(num_rows_written, num_cols, source_sink, state);
 }
 
+template <data_type DataType>
+void BM_parquet_read_data(nvbench::state& state,
+                          nvbench::type_list<nvbench::enum_type<DataType>> type_list)
+{
+  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  BM_parquet_read_data_common<DataType>(
+    state, data_profile_builder().cardinality(cardinality).avg_run_length(run_length), type_list);
+}
+
+template <data_type DataType>
+void BM_parquet_read_fixed_width_struct(nvbench::state& state,
+                                        nvbench::type_list<nvbench::enum_type<DataType>> type_list)
+{
+  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  std::vector<cudf::type_id> s_types{
+    cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::INT64};
+  BM_parquet_read_data_common<DataType>(state,
+                                        data_profile_builder()
+                                          .cardinality(cardinality)
+                                          .avg_run_length(run_length)
+                                          .struct_types(s_types),
+                                        type_list);
+}
+
 void BM_parquet_read_io_compression(nvbench::state& state)
 {
   auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
@@ -247,3 +271,13 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
   .add_int64_axis("cardinality", {0, 1000})
   .add_int64_axis("run_length", {1, 32})
   .add_int64_axis("num_string_cols", {1, 2, 3});
+
+// a benchmark for structs that only contain fixed-width types
+using d_type_list_struct_only = nvbench::enum_type_list<data_type::STRUCT>;
+NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only))
+  .set_name("parquet_read_fixed_width_struct")
+  .set_type_axes_names({"data_type"})
+  .add_string_axis("io_type", {"DEVICE_BUFFER"})
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {0, 1000})
+  .add_int64_axis("run_length", {1, 32});