Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Support parquet read from dictionary-encoded nonoptional pages
Browse files Browse the repository at this point in the history
  • Loading branch information
mdrach committed Dec 14, 2021
1 parent 280ed1d commit f02bd15
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 0 deletions.
33 changes: 33 additions & 0 deletions src/io/parquet/read/fixed_size_binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,31 @@ pub(crate) fn read_dict_buffer(
}
}

/// Decodes `additional` dictionary-encoded, required (non-optional) fixed-size binary
/// values from a data page and appends their bytes to `values`.
///
/// Assumptions: No rep levels
///
/// # Arguments
/// * `indices_buffer` - page payload: one byte holding the bit width of the entry ids,
///   followed by the RLE/bit-packed entry ids themselves.
/// * `additional` - number of values to decode from the page.
/// * `size` - width, in bytes, of every value stored in the dictionary.
/// * `dict` - dictionary page whose `values()` holds the entries back to back.
/// * `values` - output buffer; receives `size` bytes per decoded index.
/// * `validity` - output bitmap; receives one set bit per decoded value.
///
/// # Panics
/// Panics if `indices_buffer` is empty, or if a decoded index addresses bytes beyond
/// the end of the dictionary values.
pub(crate) fn read_dict_required(
    indices_buffer: &[u8],
    additional: usize,
    size: usize,
    dict: &FixedLenByteArrayPageDict,
    values: &mut MutableBuffer<u8>,
    validity: &mut MutableBitmap,
) {
    let dict_values = dict.values();

    // SPEC: Data page format: the bit width used to encode the entry ids stored as 1 byte (max bit width = 32),
    // SPEC: followed by the values encoded using RLE/Bit packed described above (with the given bit width).
    let bit_width = indices_buffer[0];
    let indices_buffer = &indices_buffer[1..];

    let indices = hybrid_rle::HybridRleDecoder::new(indices_buffer, bit_width as u32, additional);

    for index in indices {
        let index = index as usize;
        // Each dictionary entry occupies exactly `size` contiguous bytes.
        values.extend_from_slice(&dict_values[index * size..(index + 1) * size]);
    }

    // Validity is tracked per value, not per byte: a required page contributes exactly
    // `additional` set bits (the previous `additional * size` over-extended the bitmap).
    validity.extend_constant(additional, true);
}

pub(crate) fn read_optional(
validity_buffer: &[u8],
values_buffer: &[u8],
Expand Down Expand Up @@ -217,6 +242,14 @@ pub(crate) fn extend_from_page(
values,
validity,
),
(Encoding::PlainDictionary, Some(dict), false) => read_dict_required(
values_buffer,
additional,
size,
dict.as_any().downcast_ref().unwrap(),
values,
validity,
),
(Encoding::Plain, _, true) => read_optional(
validity_buffer,
values_buffer,
Expand Down
15 changes: 15 additions & 0 deletions tests/it/io/parquet/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,11 @@ fn v2_decimal_9_required() -> Result<()> {
test_pyarrow_integration(6, 2, "basic", false, true, None)
}

#[test]
fn v2_decimal_9_required_dict() -> Result<()> {
    // Same column, version and file as `v2_decimal_9_required`; only the fourth
    // argument differs (`true` here) — presumably the dictionary-encoding switch.
    let (column, version, file) = (6, 2, "basic");
    let (fourth_flag, fifth_flag) = (true, true);
    test_pyarrow_integration(column, version, file, fourth_flag, fifth_flag, None)
}

#[test]
fn v2_decimal_18_nullable() -> Result<()> {
test_pyarrow_integration(8, 2, "basic", false, false, None)
Expand All @@ -323,6 +328,11 @@ fn v2_decimal_18_required() -> Result<()> {
test_pyarrow_integration(7, 2, "basic", false, true, None)
}

#[test]
fn v2_decimal_18_required_dict() -> Result<()> {
    // Same column, version and file as `v2_decimal_18_required`; only the fourth
    // argument differs (`true` here) — presumably the dictionary-encoding switch.
    let (column, version, file) = (7, 2, "basic");
    let (fourth_flag, fifth_flag) = (true, true);
    test_pyarrow_integration(column, version, file, fourth_flag, fifth_flag, None)
}

#[test]
fn v2_decimal_26_nullable() -> Result<()> {
test_pyarrow_integration(9, 2, "basic", false, false, None)
Expand All @@ -333,6 +343,11 @@ fn v2_decimal_26_required() -> Result<()> {
test_pyarrow_integration(8, 2, "basic", false, true, None)
}

#[test]
fn v2_decimal_26_required_dict() -> Result<()> {
    // Same column, version and file as `v2_decimal_26_required`; only the fourth
    // argument differs (`true` here) — presumably the dictionary-encoding switch.
    let (column, version, file) = (8, 2, "basic");
    let (fourth_flag, fifth_flag) = (true, true);
    test_pyarrow_integration(column, version, file, fourth_flag, fifth_flag, None)
}

#[test]
fn v1_struct_optional() -> Result<()> {
test_pyarrow_integration(0, 1, "struct", false, false, None)
Expand Down

0 comments on commit f02bd15

Please sign in to comment.