Skip to content

Commit

Permalink
apacheGH-34673: [C++][Parquet] Add Boolean Encoding benchmark for par…
Browse files Browse the repository at this point in the history
…quet (apache#34676)

### Rationale for this change

Add boolean encoding benchmark for parquet.

### What changes are included in this PR?

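New RLE boolean encoding/decoding benchmarks (including spaced variants). The existing PLAIN spaced benchmarks are generalized to take an encoding, and `RleBooleanDecoder::SetData` now reuses its existing `RleDecoder` via `Reset` instead of allocating a new one each time.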

### Are these changes tested?

No, they are benchmarks.

### Are there any user-facing changes?

No

* Closes: apache#34673

Authored-by: mwish <maplewish117@gmail.com>
Signed-off-by: Rok Mihevc <rok@mihevc.org>
  • Loading branch information
mapleFU committed Mar 24, 2023
1 parent bf424e5 commit 0977cd2
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 22 deletions.
8 changes: 6 additions & 2 deletions cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2975,8 +2975,12 @@ class RleBooleanDecoder : public DecoderImpl, virtual public BooleanDecoder {
}

auto decoder_data = data + 4;
decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes,
/*bit_width=*/1);
if (decoder_ == nullptr) {
decoder_ = std::make_shared<::arrow::util::RleDecoder>(decoder_data, num_bytes,
/*bit_width=*/1);
} else {
decoder_->Reset(decoder_data, num_bytes, /*bit_width=*/1);
}
}

int Decode(bool* buffer, int max_values) override {
Expand Down
126 changes: 106 additions & 20 deletions cpp/src/parquet/encoding_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/util/byte_stream_split.h"
#include "arrow/visit_data_inline.h"

#include "parquet/encoding.h"
#include "parquet/platform.h"
Expand Down Expand Up @@ -215,7 +216,7 @@ struct BM_SpacedEncodingTraits<BooleanType> {
using CType = bool;
};

static void BM_PlainSpacedArgs(benchmark::internal::Benchmark* bench) {
static void BM_SpacedArgs(benchmark::internal::Benchmark* bench) {
constexpr auto kPlainSpacedSize = 32 * 1024; // 32k

bench->Args({/*size*/ kPlainSpacedSize, /*null_in_ten_thousand*/ 1});
Expand All @@ -226,7 +227,7 @@ static void BM_PlainSpacedArgs(benchmark::internal::Benchmark* bench) {
}

template <typename ParquetType>
static void BM_PlainEncodingSpaced(benchmark::State& state) {
static void BM_EncodingSpaced(benchmark::State& state, Encoding::type encoding) {
using ArrowType = typename BM_SpacedEncodingTraits<ParquetType>::ArrowType;
using ArrayType = typename BM_SpacedEncodingTraits<ParquetType>::ArrayType;
using CType = typename BM_SpacedEncodingTraits<ParquetType>::CType;
Expand All @@ -245,7 +246,7 @@ static void BM_PlainEncodingSpaced(benchmark::State& state) {
// input to PutSpaced.
const auto src = reinterpret_cast<const CType*>(raw_values);

auto encoder = MakeTypedEncoder<ParquetType>(Encoding::PLAIN);
auto encoder = MakeTypedEncoder<ParquetType>(encoding);
for (auto _ : state) {
encoder->PutSpaced(src, num_values, valid_bits, 0);
encoder->FlushValues();
Expand All @@ -254,23 +255,59 @@ static void BM_PlainEncodingSpaced(benchmark::State& state) {
state.SetBytesProcessed(state.iterations() * num_values * sizeof(CType));
}

/// Benchmark PutSpaced() for BOOLEAN values using the given encoding.
///
/// state.range(0) is the number of value slots and state.range(1) is the null
/// rate expressed in ten-thousandths. A dedicated specialization is used for
/// BooleanType because the values are materialized one by one into a plain
/// `bool` buffer via the array visitor.
template <>
void BM_EncodingSpaced<BooleanType>(benchmark::State& state, Encoding::type encoding) {
  using CType = bool;

  const int num_values = static_cast<int>(state.range(0));
  const double null_percent = static_cast<double>(state.range(1)) / 10000.0;

  auto rand = ::arrow::random::RandomArrayGenerator(1923);
  const auto array = rand.Boolean(num_values, 0.5, null_percent);
  const auto valid_bits = array->null_bitmap_data();

  // Owned, value-initialized (all false) buffer: it is released even if the
  // visitor or the encoder throws, and null slots never hold indeterminate
  // values.
  const auto output = std::make_unique<bool[]>(static_cast<size_t>(num_values));
  int output_idx = 0;
  PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<::arrow::BooleanType>(
      *array->data(),
      [&](bool value) {
        output[output_idx] = value;
        ++output_idx;
        return ::arrow::Status::OK();
      },
      [&]() {
        // Keep the buffer spaced: a null still occupies one slot, so that slot
        // i lines up with bit i of valid_bits when passed to PutSpaced() below.
        ++output_idx;
        return ::arrow::Status::OK();
      }));

  auto encoder = MakeTypedEncoder<BooleanType>(encoding);
  for (auto _ : state) {
    encoder->PutSpaced(output.get(), num_values, valid_bits, 0);
    encoder->FlushValues();
  }
  state.counters["null_percent"] = null_percent * 100;
  state.SetBytesProcessed(state.iterations() * num_values * sizeof(CType));
}

// Benchmark spaced encoding of ParquetType values with PLAIN encoding.
// Thin wrapper that forwards to BM_EncodingSpaced with Encoding::PLAIN.
template <typename ParquetType>
static void BM_PlainEncodingSpaced(benchmark::State& state) {
BM_EncodingSpaced<ParquetType>(state, Encoding::PLAIN);
}

// Non-template entry point for BM_PlainEncodingSpaced<BooleanType>, so it can
// be registered with BENCHMARK().
static void BM_PlainEncodingSpacedBoolean(benchmark::State& state) {
BM_PlainEncodingSpaced<BooleanType>(state);
}
BENCHMARK(BM_PlainEncodingSpacedBoolean)->Apply(BM_PlainSpacedArgs);
BENCHMARK(BM_PlainEncodingSpacedBoolean)->Apply(BM_SpacedArgs);

// Non-template entry point for BM_PlainEncodingSpaced<FloatType>, so it can
// be registered with BENCHMARK().
static void BM_PlainEncodingSpacedFloat(benchmark::State& state) {
BM_PlainEncodingSpaced<FloatType>(state);
}
BENCHMARK(BM_PlainEncodingSpacedFloat)->Apply(BM_PlainSpacedArgs);
BENCHMARK(BM_PlainEncodingSpacedFloat)->Apply(BM_SpacedArgs);

// Non-template entry point for BM_PlainEncodingSpaced<DoubleType>, so it can
// be registered with BENCHMARK().
static void BM_PlainEncodingSpacedDouble(benchmark::State& state) {
BM_PlainEncodingSpaced<DoubleType>(state);
}
BENCHMARK(BM_PlainEncodingSpacedDouble)->Apply(BM_PlainSpacedArgs);
BENCHMARK(BM_PlainEncodingSpacedDouble)->Apply(BM_SpacedArgs);

template <typename ParquetType>
static void BM_PlainDecodingSpaced(benchmark::State& state) {
static void BM_DecodingSpaced(benchmark::State& state, Encoding::type encoding) {
using ArrowType = typename BM_SpacedEncodingTraits<ParquetType>::ArrowType;
using ArrayType = typename BM_SpacedEncodingTraits<ParquetType>::ArrayType;
using CType = typename BM_SpacedEncodingTraits<ParquetType>::CType;
Expand All @@ -279,22 +316,21 @@ static void BM_PlainDecodingSpaced(benchmark::State& state) {
const auto null_percent = static_cast<double>(state.range(1)) / 10000.0;

auto rand = ::arrow::random::RandomArrayGenerator(1923);
const auto array = rand.Numeric<ArrowType>(num_values, -100, 100, null_percent);
std::shared_ptr<::arrow::Array> array;
if constexpr (std::is_same_v<ParquetType, BooleanType>) {
array = rand.Boolean(num_values, /*true_probability*/ 0.5, null_percent);
} else {
array = rand.Numeric<ArrowType>(num_values, -100, 100, null_percent);
}
const auto valid_bits = array->null_bitmap_data();
const int null_count = static_cast<int>(array->null_count());
const auto array_actual = ::arrow::internal::checked_pointer_cast<ArrayType>(array);
const auto raw_values = array_actual->raw_values();
// Guarantee the type cast between raw_values and input of PutSpaced.
static_assert(sizeof(CType) == sizeof(*raw_values), "Type mismatch");
// Cast only happens for BooleanType as it use UInt8 for the array data to match a bool*
// input to PutSpaced.
const auto src = reinterpret_cast<const CType*>(raw_values);

auto encoder = MakeTypedEncoder<ParquetType>(Encoding::PLAIN);
encoder->PutSpaced(src, num_values, valid_bits, 0);
auto encoder = MakeTypedEncoder<ParquetType>(encoding);
encoder->Put(*array);
std::shared_ptr<Buffer> buf = encoder->FlushValues();

auto decoder = MakeTypedDecoder<ParquetType>(Encoding::PLAIN);
auto decoder = MakeTypedDecoder<ParquetType>(encoding);
std::vector<uint8_t> decode_values(num_values * sizeof(CType));
auto decode_buf = reinterpret_cast<CType*>(decode_values.data());
for (auto _ : state) {
Expand All @@ -305,20 +341,25 @@ static void BM_PlainDecodingSpaced(benchmark::State& state) {
state.SetBytesProcessed(state.iterations() * num_values * sizeof(CType));
}

// Benchmark spaced decoding of ParquetType values with PLAIN encoding.
// Thin wrapper that forwards to BM_DecodingSpaced with Encoding::PLAIN.
template <typename ParquetType>
static void BM_PlainDecodingSpaced(benchmark::State& state) {
BM_DecodingSpaced<ParquetType>(state, Encoding::PLAIN);
}

// Non-template entry point for BM_PlainDecodingSpaced<BooleanType>, so it can
// be registered with BENCHMARK().
static void BM_PlainDecodingSpacedBoolean(benchmark::State& state) {
BM_PlainDecodingSpaced<BooleanType>(state);
}
BENCHMARK(BM_PlainDecodingSpacedBoolean)->Apply(BM_PlainSpacedArgs);
BENCHMARK(BM_PlainDecodingSpacedBoolean)->Apply(BM_SpacedArgs);

// Non-template entry point for BM_PlainDecodingSpaced<FloatType>, so it can
// be registered with BENCHMARK().
static void BM_PlainDecodingSpacedFloat(benchmark::State& state) {
BM_PlainDecodingSpaced<FloatType>(state);
}
BENCHMARK(BM_PlainDecodingSpacedFloat)->Apply(BM_PlainSpacedArgs);
BENCHMARK(BM_PlainDecodingSpacedFloat)->Apply(BM_SpacedArgs);

// Non-template entry point for BM_PlainDecodingSpaced<DoubleType>, so it can
// be registered with BENCHMARK().
static void BM_PlainDecodingSpacedDouble(benchmark::State& state) {
BM_PlainDecodingSpaced<DoubleType>(state);
}
BENCHMARK(BM_PlainDecodingSpacedDouble)->Apply(BM_PlainSpacedArgs);
BENCHMARK(BM_PlainDecodingSpacedDouble)->Apply(BM_SpacedArgs);

template <typename T, typename DecodeFunc>
static void BM_ByteStreamSplitDecode(benchmark::State& state, DecodeFunc&& decode_func) {
Expand Down Expand Up @@ -696,6 +737,51 @@ static void BM_DeltaLengthDecodingSpacedByteArray(benchmark::State& state) {
BENCHMARK(BM_PlainDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments);
BENCHMARK(BM_DeltaLengthDecodingSpacedByteArray)->Apply(ByteArrayCustomArguments);

// Benchmark RLE encoding of a BOOLEAN column in which every value is `true`.
// state.range(0) gives the number of values encoded per iteration.
static void BM_RleEncodingBoolean(benchmark::State& state) {
  const auto value_count = state.range(0);
  std::vector<bool> all_true(value_count, true);
  const int put_count = static_cast<int>(all_true.size());

  // MakeEncoder() returns the untyped encoder; Put() on a std::vector<bool> is
  // reached through the BooleanEncoder interface.
  auto base_encoder = MakeEncoder(Type::BOOLEAN, Encoding::RLE);
  auto* rle_encoder = dynamic_cast<BooleanEncoder*>(base_encoder.get());

  for (auto _ : state) {
    rle_encoder->Put(all_true, put_count);
    rle_encoder->FlushValues();
  }

  state.SetBytesProcessed(state.iterations() * value_count * sizeof(bool));
}

BENCHMARK(BM_RleEncodingBoolean)->Range(MIN_RANGE, MAX_RANGE);

// Benchmark RLE decoding of a BOOLEAN column in which every value is `true`.
// state.range(0) gives the number of values; the input is encoded once up
// front, and each iteration resets the decoder on that buffer and decodes all
// values.
static void BM_RleDecodingBoolean(benchmark::State& state) {
  std::vector<bool> values(state.range(0), true);
  const int num_values = static_cast<int>(values.size());

  // Owned output buffer: released automatically even if encoding or decoding
  // throws, unlike a raw new[]/delete[] pair.
  const auto output = std::make_unique<bool[]>(values.size());

  auto encoder = MakeEncoder(Type::BOOLEAN, Encoding::RLE);
  auto typed_encoder = dynamic_cast<BooleanEncoder*>(encoder.get());
  typed_encoder->Put(values, num_values);
  std::shared_ptr<Buffer> buf = encoder->FlushValues();

  auto decoder = MakeTypedDecoder<BooleanType>(Encoding::RLE);
  for (auto _ : state) {
    decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size()));
    decoder->Decode(output.get(), num_values);
  }

  state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(bool));
}

BENCHMARK(BM_RleDecodingBoolean)->Range(MIN_RANGE, MAX_RANGE);

// Benchmark spaced encoding of BOOLEAN values with RLE encoding by forwarding
// to BM_EncodingSpaced<BooleanType> with Encoding::RLE.
static void BM_RleEncodingSpacedBoolean(benchmark::State& state) {
BM_EncodingSpaced<BooleanType>(state, Encoding::RLE);
}
BENCHMARK(BM_RleEncodingSpacedBoolean)->Apply(BM_SpacedArgs);

// Benchmark spaced decoding of BOOLEAN values with RLE encoding by forwarding
// to BM_DecodingSpaced<BooleanType> with Encoding::RLE.
static void BM_RleDecodingSpacedBoolean(benchmark::State& state) {
BM_DecodingSpaced<BooleanType>(state, Encoding::RLE);
}
BENCHMARK(BM_RleDecodingSpacedBoolean)->Apply(BM_SpacedArgs);

template <typename Type>
static void DecodeDict(std::vector<typename Type::c_type>& values,
benchmark::State& state) {
Expand Down

0 comments on commit 0977cd2

Please sign in to comment.