Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added support for required dict utf8 read. (#419)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Sep 17, 2021
1 parent 4701f10 commit aee8124
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 0 deletions.
42 changes: 42 additions & 0 deletions src/io/parquet/read/binary/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,38 @@ fn read_dict_buffer<O: Offset>(
}
}

#[allow(clippy::too_many_arguments)]
fn read_dict_required<O: Offset>(
indices_buffer: &[u8],
additional: usize,
dict: &BinaryPageDict,
offsets: &mut MutableBuffer<O>,
values: &mut MutableBuffer<u8>,
validity: &mut MutableBitmap,
) {
let dict_values = dict.values();
let dict_offsets = dict.offsets();
let mut last_offset = *offsets.as_mut_slice().last().unwrap();

// SPEC: Data page format: the bit width used to encode the entry ids stored as 1 byte (max bit width = 32),
// SPEC: followed by the values encoded using RLE/Bit packed described above (with the given bit width).
let bit_width = indices_buffer[0];
let indices_buffer = &indices_buffer[1..];

let indices = hybrid_rle::HybridRleDecoder::new(indices_buffer, bit_width as u32, additional);

for index in indices {
let index = index as usize;
let dict_offset_i = dict_offsets[index] as usize;
let dict_offset_ip1 = dict_offsets[index + 1] as usize;
let length = dict_offset_ip1 - dict_offset_i;
last_offset += O::from_usize(length).unwrap();
offsets.push(last_offset);
values.extend_from_slice(&dict_values[dict_offset_i..dict_offset_ip1]);
}
validity.extend_constant(additional, true);
}

fn read_delta_optional<O: Offset>(
validity_buffer: &[u8],
values_buffer: &[u8],
Expand Down Expand Up @@ -226,6 +258,16 @@ fn extend_from_page<O: Offset>(
validity,
)
}
(Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => {
read_dict_required::<O>(
values_buffer,
additional,
dict.as_any().downcast_ref().unwrap(),
offsets,
values,
validity,
)
}
(Encoding::DeltaLengthByteArray, None, true) => read_delta_optional::<O>(
validity_buffer,
values_buffer,
Expand Down
10 changes: 10 additions & 0 deletions tests/it/io/parquet/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,16 @@ fn v1_utf8_nullable_dict() -> Result<()> {
test_pyarrow_integration(2, 1, "basic", true, false)
}

#[test]
fn v2_utf8_required_dict() -> Result<()> {
test_pyarrow_integration(2, 2, "basic", true, true)
}

#[test]
fn v1_utf8_required_dict() -> Result<()> {
test_pyarrow_integration(2, 1, "basic", true, true)
}

#[test]
fn v2_boolean_nullable() -> Result<()> {
test_pyarrow_integration(3, 2, "basic", false, false)
Expand Down

0 comments on commit aee8124

Please sign in to comment.