From cd5a4c0706176833fd536ee5ef39587b79707e3a Mon Sep 17 00:00:00 2001 From: Jorge Leitao Date: Fri, 15 Apr 2022 10:01:19 +0100 Subject: [PATCH] Migrate to latest parquet2 (#923) --- Cargo.toml | 4 +- arrow-parquet-integration-testing/src/main.rs | 3 +- benches/write_parquet.rs | 3 +- examples/parquet_write.rs | 3 +- examples/parquet_write_parallel/src/main.rs | 3 +- src/array/fixed_size_binary/mod.rs | 30 +- src/doc/lib.md | 3 +- src/error.rs | 6 + src/io/parquet/mod.rs | 10 +- .../parquet/read/deserialize/binary/basic.rs | 199 ++++++++-- .../parquet/read/deserialize/binary/nested.rs | 64 ++- .../parquet/read/deserialize/binary/utils.rs | 61 +++ .../parquet/read/deserialize/boolean/basic.rs | 100 +++-- .../read/deserialize/boolean/nested.rs | 36 +- src/io/parquet/read/deserialize/dictionary.rs | 134 ++++--- .../deserialize/fixed_size_binary/basic.rs | 128 ++++-- src/io/parquet/read/deserialize/mod.rs | 7 +- .../parquet/read/deserialize/nested_utils.rs | 72 ++-- .../read/deserialize/primitive/basic.rs | 83 +++- .../read/deserialize/primitive/nested.rs | 61 ++- src/io/parquet/read/deserialize/simple.rs | 84 ++-- src/io/parquet/read/deserialize/utils.rs | 366 ++++++++++++------ src/io/parquet/read/indexes/binary.rs | 43 ++ src/io/parquet/read/indexes/boolean.rs | 21 + .../parquet/read/indexes/fixed_len_binary.rs | 58 +++ src/io/parquet/read/indexes/mod.rs | 141 +++++++ src/io/parquet/read/indexes/primitive.rs | 204 ++++++++++ src/io/parquet/read/mod.rs | 30 +- src/io/parquet/read/row_group.rs | 12 +- src/io/parquet/read/schema/convert.rs | 205 +++++----- src/io/parquet/read/schema/mod.rs | 4 - src/io/parquet/read/statistics/primitive.rs | 52 ++- src/io/parquet/write/binary/basic.rs | 20 +- src/io/parquet/write/binary/mod.rs | 1 + src/io/parquet/write/binary/nested.rs | 16 +- src/io/parquet/write/boolean/basic.rs | 13 +- src/io/parquet/write/boolean/nested.rs | 13 +- src/io/parquet/write/dictionary.rs | 97 +++-- src/io/parquet/write/file.rs | 36 +- src/io/parquet/write/fixed_len_bytes.rs | 32 +- src/io/parquet/write/mod.rs | 65 ++-- src/io/parquet/write/primitive/basic.rs | 18 +- src/io/parquet/write/primitive/mod.rs | 1 + src/io/parquet/write/primitive/nested.rs | 15 +- src/io/parquet/write/row_group.rs | 32 +- src/io/parquet/write/schema.rs | 98 ++--- src/io/parquet/write/sink.rs | 28 +- src/io/parquet/write/utf8/basic.rs | 20 +- src/io/parquet/write/utf8/mod.rs | 1 + src/io/parquet/write/utf8/nested.rs | 15 +- src/io/parquet/write/utils.rs | 62 +-- tests/it/io/parquet/mod.rs | 28 +- tests/it/io/parquet/read_indexes.rs | 223 +++++++++++ tests/it/io/parquet/write.rs | 11 +- tests/it/io/parquet/write_async.rs | 6 +- 55 files changed, 2115 insertions(+), 966 deletions(-) create mode 100644 src/io/parquet/read/indexes/binary.rs create mode 100644 src/io/parquet/read/indexes/boolean.rs create mode 100644 src/io/parquet/read/indexes/fixed_len_binary.rs create mode 100644 src/io/parquet/read/indexes/mod.rs create mode 100644 src/io/parquet/read/indexes/primitive.rs create mode 100644 tests/it/io/parquet/read_indexes.rs diff --git a/Cargo.toml b/Cargo.toml index f79de24413e..95a029f1648 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,7 +53,7 @@ hex = { version = "^0.4", optional = true } # for IPC compression lz4 = { version = "1.23.1", optional = true } -zstd = { version = "0.10", optional = true } +zstd = { version = "0.11", optional = true } rand = { version = "0.8", optional = true } @@ -68,7 +68,7 @@ futures = { version = "0.3", optional = true } ahash = { version = "0.7", optional = true } # 
parquet support -parquet2 = { version = "0.10", optional = true, default_features = false, features = ["stream"] } +parquet2 = { version = "0.11", optional = true, default_features = false, features = ["stream"] } # avro support avro-schema = { version = "0.2", optional = true } diff --git a/arrow-parquet-integration-testing/src/main.rs b/arrow-parquet-integration-testing/src/main.rs index 787474cce7d..a9f5c649954 100644 --- a/arrow-parquet-integration-testing/src/main.rs +++ b/arrow-parquet-integration-testing/src/main.rs @@ -196,8 +196,7 @@ fn main() -> Result<()> { writer.start()?; for group in row_groups { - let (group, len) = group?; - writer.write(group, len)?; + writer.write(group?)?; } let _ = writer.end(None)?; diff --git a/benches/write_parquet.rs b/benches/write_parquet.rs index 32b264bfe53..42cf8deec49 100644 --- a/benches/write_parquet.rs +++ b/benches/write_parquet.rs @@ -34,8 +34,7 @@ fn write(array: &dyn Array, encoding: Encoding) -> Result<()> { writer.start()?; for group in row_groups { - let (group, len) = group?; - writer.write(group, len)?; + writer.write(group?)?; } let _ = writer.end(None)?; Ok(()) diff --git a/examples/parquet_write.rs b/examples/parquet_write.rs index df7939563bb..f11e2ec4f29 100644 --- a/examples/parquet_write.rs +++ b/examples/parquet_write.rs @@ -30,8 +30,7 @@ fn write_batch(path: &str, schema: Schema, columns: Chunk>) -> Re writer.start()?; for group in row_groups { - let (group, len) = group?; - writer.write(group, len)?; + writer.write(group?)?; } let _size = writer.end(None)?; Ok(()) diff --git a/examples/parquet_write_parallel/src/main.rs b/examples/parquet_write_parallel/src/main.rs index e997f11ce4b..2af167e6279 100644 --- a/examples/parquet_write_parallel/src/main.rs +++ b/examples/parquet_write_parallel/src/main.rs @@ -99,8 +99,7 @@ fn parallel_write(path: &str, schema: &Schema, batches: &[Chunk]) -> Result<()> // Write the file. writer.start()?; for group in row_groups { - let (group, len) = group?; - writer.write(group, len)?; + writer.write(group?)?; } let _size = writer.end(None)?; diff --git a/src/array/fixed_size_binary/mod.rs b/src/array/fixed_size_binary/mod.rs index 82695c437e7..f57fe544096 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -1,4 +1,9 @@ -use crate::{bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::ArrowError}; +use crate::{ + bitmap::{Bitmap, MutableBitmap}, + buffer::Buffer, + datatypes::DataType, + error::ArrowError, +}; use super::Array; @@ -274,6 +279,29 @@ impl FixedSizeBinaryArray { .unwrap() .into() } + + /// Creates a [`FixedSizeBinaryArray`] from a slice of arrays of bytes + pub fn from_slice>(a: P) -> Self { + let values = a.as_ref().iter().flatten().copied().collect::>(); + Self::new(DataType::FixedSizeBinary(N), values.into(), None) + } + + /// Creates a new [`FixedSizeBinaryArray`] from a slice of optional `[u8]`. + // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it. + pub fn from]>>(slice: P) -> Self { + let values = slice + .as_ref() + .iter() + .copied() + .flat_map(|x| x.unwrap_or([0; N])) + .collect::>(); + let validity = slice + .as_ref() + .iter() + .map(|x| x.is_some()) + .collect::(); + Self::new(DataType::FixedSizeBinary(N), values.into(), validity.into()) + } } pub trait FixedSizeBinaryValues { diff --git a/src/doc/lib.md b/src/doc/lib.md index 9638ff47480..08108b50932 100644 --- a/src/doc/lib.md +++ b/src/doc/lib.md @@ -62,8 +62,7 @@ fn main() -> Result<()> { // Write the file. 
writer.start()?; for group in row_groups { - let (group, len) = group?; - writer.write(group, len)?; + writer.write(group?)?; } let _ = writer.end(None)?; Ok(()) diff --git a/src/error.rs b/src/error.rs index 22faa164c35..1ee610085a4 100644 --- a/src/error.rs +++ b/src/error.rs @@ -52,6 +52,12 @@ impl From for ArrowError { } } +impl From for ArrowError { + fn from(error: std::string::FromUtf8Error) -> Self { + ArrowError::External("".to_string(), Box::new(error)) + } +} + impl From for ArrowError { fn from(error: simdutf8::basic::Utf8Error) -> Self { ArrowError::External("".to_string(), Box::new(error)) diff --git a/src/io/parquet/mod.rs b/src/io/parquet/mod.rs index ba17c825b6d..5ef1042e988 100644 --- a/src/io/parquet/mod.rs +++ b/src/io/parquet/mod.rs @@ -6,10 +6,10 @@ pub mod write; const ARROW_SCHEMA_META_KEY: &str = "ARROW:schema"; -impl From for ArrowError { - fn from(error: parquet2::error::ParquetError) -> Self { +impl From for ArrowError { + fn from(error: parquet2::error::Error) -> Self { match error { - parquet2::error::ParquetError::FeatureNotActive(_, _) => { + parquet2::error::Error::FeatureNotActive(_, _) => { let message = "Failed to read a compressed parquet file. \ Use the cargo feature \"io_parquet_compression\" to read compressed parquet files." .to_string(); @@ -20,8 +20,8 @@ impl From for ArrowError { } } -impl From for parquet2::error::ParquetError { +impl From for parquet2::error::Error { fn from(error: ArrowError) -> Self { - parquet2::error::ParquetError::General(error.to_string()) + parquet2::error::Error::General(error.to_string()) } } diff --git a/src/io/parquet/read/deserialize/binary/basic.rs b/src/io/parquet/read/deserialize/binary/basic.rs index 32f491d0b37..0ff4cd31f34 100644 --- a/src/io/parquet/read/deserialize/binary/basic.rs +++ b/src/io/parquet/read/deserialize/binary/basic.rs @@ -2,6 +2,7 @@ use std::collections::VecDeque; use std::default::Default; use parquet2::{ + deserialize::SliceFilteredIter, encoding::{hybrid_rle, Encoding}, page::{BinaryPageDict, DataPage}, schema::Repetition, @@ -16,10 +17,11 @@ use crate::{ }; use super::super::utils::{ - extend_from_decoder, next, BinaryIter, DecodedState, MaybeNext, OptionalPageValidity, + extend_from_decoder, get_selected_rows, next, DecodedState, FilteredOptionalPageValidity, + MaybeNext, OptionalPageValidity, }; use super::super::DataPages; -use super::{super::utils, utils::Binary}; +use super::{super::utils, utils::*}; /* fn read_delta_optional( @@ -61,16 +63,79 @@ fn read_delta_optional( #[derive(Debug)] pub(super) struct Required<'a> { - pub values: BinaryIter<'a>, - pub remaining: usize, + pub values: SizedBinaryIter<'a>, } impl<'a> Required<'a> { pub fn new(page: &'a DataPage) -> Self { - Self { - values: BinaryIter::new(page.buffer()), - remaining: page.num_values(), - } + let values = SizedBinaryIter::new(page.buffer(), page.num_values()); + + Self { values } + } + + pub fn len(&self) -> usize { + self.values.size_hint().0 + } +} + +#[derive(Debug)] +pub(super) struct FilteredRequired<'a> { + pub values: SliceFilteredIter>, +} + +impl<'a> FilteredRequired<'a> { + pub fn new(page: &'a DataPage) -> Self { + let values = SizedBinaryIter::new(page.buffer(), page.num_values()); + + let rows = get_selected_rows(page); + let values = SliceFilteredIter::new(values, rows); + + Self { values } + } + + pub fn len(&self) -> usize { + self.values.size_hint().0 + } +} + +#[derive(Debug)] +pub(super) struct RequiredDictionary<'a> { + pub values: hybrid_rle::HybridRleDecoder<'a>, + pub dict: &'a BinaryPageDict, 
+} + +impl<'a> RequiredDictionary<'a> { + pub fn new(page: &'a DataPage, dict: &'a BinaryPageDict) -> Self { + let values = utils::dict_indices_decoder(page); + + Self { dict, values } + } + + #[inline] + pub fn len(&self) -> usize { + self.values.size_hint().0 + } +} + +#[derive(Debug)] +pub(super) struct FilteredRequiredDictionary<'a> { + pub values: SliceFilteredIter>, + pub dict: &'a BinaryPageDict, +} + +impl<'a> FilteredRequiredDictionary<'a> { + pub fn new(page: &'a DataPage, dict: &'a BinaryPageDict) -> Self { + let values = utils::dict_indices_decoder(page); + + let rows = get_selected_rows(page); + let values = SliceFilteredIter::new(values, rows); + + Self { values, dict } + } + + #[inline] + pub fn len(&self) -> usize { + self.values.size_hint().0 } } @@ -82,8 +147,7 @@ pub(super) struct ValuesDictionary<'a> { impl<'a> ValuesDictionary<'a> { pub fn new(page: &'a DataPage, dict: &'a BinaryPageDict) -> Self { - let (_, _, indices_buffer) = utils::split_buffer(page); - let values = utils::dict_indices_decoder(indices_buffer, page.num_values()); + let values = utils::dict_indices_decoder(page); Self { dict, values } } @@ -97,17 +161,25 @@ impl<'a> ValuesDictionary<'a> { enum State<'a> { Optional(OptionalPageValidity<'a>, BinaryIter<'a>), Required(Required<'a>), - RequiredDictionary(ValuesDictionary<'a>), + RequiredDictionary(RequiredDictionary<'a>), OptionalDictionary(OptionalPageValidity<'a>, ValuesDictionary<'a>), + FilteredRequired(FilteredRequired<'a>), + FilteredOptional(FilteredOptionalPageValidity<'a>, BinaryIter<'a>), + FilteredRequiredDictionary(FilteredRequiredDictionary<'a>), + FilteredOptionalDictionary(FilteredOptionalPageValidity<'a>, ValuesDictionary<'a>), } impl<'a> utils::PageState<'a> for State<'a> { fn len(&self) -> usize { match self { State::Optional(validity, _) => validity.len(), - State::Required(state) => state.remaining, + State::Required(state) => state.len(), State::RequiredDictionary(values) => values.len(), State::OptionalDictionary(optional, _) => optional.len(), + State::FilteredRequired(state) => state.len(), + State::FilteredOptional(validity, _) => validity.len(), + State::FilteredRequiredDictionary(values) => values.len(), + State::FilteredOptionalDictionary(optional, _) => optional.len(), } } } @@ -162,16 +234,22 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { fn build_state(&self, page: &'a DataPage) -> Result { let is_optional = - page.descriptor().type_().get_basic_info().repetition() == &Repetition::Optional; - - match (page.encoding(), page.dictionary_page(), is_optional) { - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => { - Ok(State::RequiredDictionary(ValuesDictionary::new( + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); + + match ( + page.encoding(), + page.dictionary_page(), + is_optional, + is_filtered, + ) { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { + Ok(State::RequiredDictionary(RequiredDictionary::new( page, dict.as_any().downcast_ref().unwrap(), ))) } - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true) => { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true, false) => { let dict = dict.as_any().downcast_ref().unwrap(); Ok(State::OptionalDictionary( @@ -179,21 +257,41 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { ValuesDictionary::new(page, dict), )) } - (Encoding::Plain, _, true) => { + 
(Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, true) => { + let dict = dict.as_any().downcast_ref().unwrap(); + + Ok(State::FilteredRequiredDictionary( + FilteredRequiredDictionary::new(page, dict), + )) + } + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true, true) => { + let dict = dict.as_any().downcast_ref().unwrap(); + + Ok(State::FilteredOptionalDictionary( + FilteredOptionalPageValidity::new(page), + ValuesDictionary::new(page, dict), + )) + } + (Encoding::Plain, _, true, false) => { let (_, _, values) = utils::split_buffer(page); let values = BinaryIter::new(values); Ok(State::Optional(OptionalPageValidity::new(page), values)) } - (Encoding::Plain, _, false) => Ok(State::Required(Required::new(page))), - _ => Err(utils::not_implemented( - &page.encoding(), - is_optional, - false, - "any", - "Binary", - )), + (Encoding::Plain, _, false, false) => Ok(State::Required(Required::new(page))), + (Encoding::Plain, _, false, true) => { + Ok(State::FilteredRequired(FilteredRequired::new(page))) + } + (Encoding::Plain, _, true, true) => { + let (_, _, values) = utils::split_buffer(page); + + Ok(State::FilteredOptional( + FilteredOptionalPageValidity::new(page), + BinaryIter::new(values), + )) + } + _ => Err(utils::not_implemented(page)), } } @@ -220,7 +318,11 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { page_values, ), State::Required(page) => { - page.remaining = page.remaining.saturating_sub(additional); + for x in page.values.by_ref().take(additional) { + values.push(x) + } + } + State::FilteredRequired(page) => { for x in page.values.by_ref().take(additional) { values.push(x) } @@ -257,6 +359,47 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { values.push(x) } } + State::FilteredOptional(page_validity, page_values) => { + utils::extend_from_decoder( + validity, + page_validity, + Some(additional), + values, + page_values.by_ref(), + ); + } + State::FilteredRequiredDictionary(page) => { + let dict_values = page.dict.values(); + let dict_offsets = page.dict.offsets(); + let op = move |index: u32| { + let index = index as usize; + let dict_offset_i = dict_offsets[index] as usize; + let dict_offset_ip1 = dict_offsets[index + 1] as usize; + &dict_values[dict_offset_i..dict_offset_ip1] + }; + + for x in page.values.by_ref().map(op).take(additional) { + values.push(x) + } + } + State::FilteredOptionalDictionary(page_validity, page_values) => { + let dict_values = page_values.dict.values(); + let dict_offsets = page_values.dict.offsets(); + + let op = move |index: u32| { + let index = index as usize; + let dict_offset_i = dict_offsets[index] as usize; + let dict_offset_ip1 = dict_offsets[index + 1] as usize; + &dict_values[dict_offset_i..dict_offset_ip1] + }; + utils::extend_from_decoder( + validity, + page_validity, + Some(additional), + values, + &mut page_values.values.by_ref().map(op), + ) + } } } } diff --git a/src/io/parquet/read/deserialize/binary/nested.rs b/src/io/parquet/read/deserialize/binary/nested.rs index 522cc63cf49..a0427d08474 100644 --- a/src/io/parquet/read/deserialize/binary/nested.rs +++ b/src/io/parquet/read/deserialize/binary/nested.rs @@ -10,7 +10,7 @@ use crate::{ use super::super::nested_utils::*; use super::super::utils::MaybeNext; use super::basic::ValuesDictionary; -use super::utils::Binary; +use super::utils::*; use super::{ super::utils, basic::{finish, Required, TraitBinaryArray}, @@ -19,7 +19,7 @@ use super::{ #[allow(clippy::large_enum_variant)] #[derive(Debug)] enum State<'a> { - 
Optional(Optional<'a>, utils::BinaryIter<'a>), + Optional(Optional<'a>, BinaryIter<'a>), Required(Required<'a>), RequiredDictionary(ValuesDictionary<'a>), OptionalDictionary(Optional<'a>, ValuesDictionary<'a>), @@ -29,7 +29,7 @@ impl<'a> utils::PageState<'a> for State<'a> { fn len(&self) -> usize { match self { State::Optional(validity, _) => validity.len(), - State::Required(state) => state.remaining, + State::Required(state) => state.len(), State::RequiredDictionary(required) => required.len(), State::OptionalDictionary(optional, _) => optional.len(), } @@ -47,35 +47,35 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { fn build_state(&self, page: &'a DataPage) -> Result { let is_optional = - page.descriptor().type_().get_basic_info().repetition() == &Repetition::Optional; - - match (page.encoding(), page.dictionary_page(), is_optional) { - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => { + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); + + match ( + page.encoding(), + page.dictionary_page(), + is_optional, + is_filtered, + ) { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { let dict = dict.as_any().downcast_ref().unwrap(); Ok(State::RequiredDictionary(ValuesDictionary::new(page, dict))) } - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true) => { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true, false) => { let dict = dict.as_any().downcast_ref().unwrap(); Ok(State::OptionalDictionary( Optional::new(page), ValuesDictionary::new(page, dict), )) } - (Encoding::Plain, None, true) => { + (Encoding::Plain, None, true, false) => { let (_, _, values) = utils::split_buffer(page); - let values = utils::BinaryIter::new(values); + let values = BinaryIter::new(values); Ok(State::Optional(Optional::new(page), values)) } - (Encoding::Plain, None, false) => Ok(State::Required(Required::new(page))), - _ => Err(utils::not_implemented( - &page.encoding(), - is_optional, - false, - "any", - "Binary", - )), + (Encoding::Plain, None, false, false) => Ok(State::Required(Required::new(page))), + _ => Err(utils::not_implemented(page)), } } @@ -95,18 +95,12 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { let (values, validity) = decoded; match state { State::Optional(page_validity, page_values) => { - let max_def = page_validity.max_def(); - read_optional_values( - page_validity.definition_levels.by_ref(), - max_def, - page_values.by_ref(), - values, - validity, - additional, - ) + let items = page_validity.by_ref().take(additional); + let items = Zip::new(items, page_values.by_ref()); + + read_optional_values(items, values, validity) } State::Required(page) => { - page.remaining -= additional; for x in page.values.by_ref().take(additional) { values.push(x) } @@ -126,7 +120,6 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { } } State::OptionalDictionary(page_validity, page_values) => { - let max_def = page_validity.max_def(); let dict_values = page_values.dict.values(); let dict_offsets = page_values.dict.offsets(); @@ -136,14 +129,11 @@ impl<'a, O: Offset> utils::Decoder<'a> for BinaryDecoder { let dict_offset_ip1 = dict_offsets[index + 1] as usize; &dict_values[dict_offset_i..dict_offset_ip1] }; - read_optional_values( - page_validity.definition_levels.by_ref(), - max_def, - page_values.values.by_ref().map(op), - values, - validity, - additional, - ) + + let items = 
page_validity.by_ref().take(additional); + let items = Zip::new(items, page_values.values.by_ref().map(op)); + + read_optional_values(items, values, validity) } } } diff --git a/src/io/parquet/read/deserialize/binary/utils.rs b/src/io/parquet/read/deserialize/binary/utils.rs index d967417eb63..370ad6ffae9 100644 --- a/src/io/parquet/read/deserialize/binary/utils.rs +++ b/src/io/parquet/read/deserialize/binary/utils.rs @@ -87,3 +87,64 @@ impl<'a, O: Offset> Pushable<&'a [u8]> for Binary { self.extend_constant(additional) } } + +#[derive(Debug)] +pub struct BinaryIter<'a> { + values: &'a [u8], +} + +impl<'a> BinaryIter<'a> { + pub fn new(values: &'a [u8]) -> Self { + Self { values } + } +} + +impl<'a> Iterator for BinaryIter<'a> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option { + if self.values.is_empty() { + return None; + } + let length = u32::from_le_bytes(self.values[0..4].try_into().unwrap()) as usize; + self.values = &self.values[4..]; + let result = &self.values[..length]; + self.values = &self.values[length..]; + Some(result) + } +} + +#[derive(Debug)] +pub struct SizedBinaryIter<'a> { + iter: BinaryIter<'a>, + remaining: usize, +} + +impl<'a> SizedBinaryIter<'a> { + pub fn new(values: &'a [u8], size: usize) -> Self { + let iter = BinaryIter::new(values); + Self { + iter, + remaining: size, + } + } +} + +impl<'a> Iterator for SizedBinaryIter<'a> { + type Item = &'a [u8]; + + #[inline] + fn next(&mut self) -> Option { + if self.remaining == 0 { + return None; + } else { + self.remaining -= 1 + }; + self.iter.next() + } + + fn size_hint(&self) -> (usize, Option) { + (self.remaining, Some(self.remaining)) + } +} diff --git a/src/io/parquet/read/deserialize/boolean/basic.rs b/src/io/parquet/read/deserialize/boolean/basic.rs index 705b6d0747a..6f6c31b0f2c 100644 --- a/src/io/parquet/read/deserialize/boolean/basic.rs +++ b/src/io/parquet/read/deserialize/boolean/basic.rs @@ -1,6 +1,8 @@ use std::collections::VecDeque; -use parquet2::{encoding::Encoding, page::DataPage, schema::Repetition}; +use parquet2::{ + deserialize::SliceFilteredIter, encoding::Encoding, page::DataPage, schema::Repetition, +}; use crate::{ array::BooleanArray, @@ -11,25 +13,19 @@ use crate::{ use super::super::utils; use super::super::utils::{ - extend_from_decoder, next, split_buffer, DecodedState, Decoder, MaybeNext, OptionalPageValidity, + extend_from_decoder, get_selected_rows, next, split_buffer, DecodedState, Decoder, + FilteredOptionalPageValidity, MaybeNext, OptionalPageValidity, }; use super::super::DataPages; -// The state of an optional DataPage with a boolean physical type #[derive(Debug)] -struct Optional<'a> { - values: BitmapIter<'a>, - validity: OptionalPageValidity<'a>, -} +struct Values<'a>(BitmapIter<'a>); -impl<'a> Optional<'a> { +impl<'a> Values<'a> { pub fn new(page: &'a DataPage) -> Self { - let (_, _, values_buffer) = split_buffer(page); + let (_, _, values) = split_buffer(page); - Self { - values: BitmapIter::new(values_buffer, 0, values_buffer.len() * 8), - validity: OptionalPageValidity::new(page), - } + Self(BitmapIter::new(values, 0, values.len() * 8)) } } @@ -52,18 +48,44 @@ impl<'a> Required<'a> { } } +#[derive(Debug)] +struct FilteredRequired<'a> { + values: SliceFilteredIter>, +} + +impl<'a> FilteredRequired<'a> { + pub fn new(page: &'a DataPage) -> Self { + // todo: replace this by an iterator over slices, for faster deserialization + let values = BitmapIter::new(page.buffer(), 0, page.num_values()); + + let rows = get_selected_rows(page); + let values = 
SliceFilteredIter::new(values, rows); + + Self { values } + } + + #[inline] + pub fn len(&self) -> usize { + self.values.size_hint().0 + } +} + // The state of a `DataPage` of `Boolean` parquet boolean type #[derive(Debug)] enum State<'a> { - Optional(Optional<'a>), + Optional(OptionalPageValidity<'a>, Values<'a>), Required(Required<'a>), + FilteredRequired(FilteredRequired<'a>), + FilteredOptional(FilteredOptionalPageValidity<'a>, Values<'a>), } impl<'a> State<'a> { pub fn len(&self) -> usize { match self { - State::Optional(page) => page.validity.len(), + State::Optional(validity, _) => validity.len(), State::Required(page) => page.length - page.offset, + State::FilteredRequired(page) => page.len(), + State::FilteredOptional(optional, _) => optional.len(), } } } @@ -89,18 +111,23 @@ impl<'a> Decoder<'a> for BooleanDecoder { fn build_state(&self, page: &'a DataPage) -> Result { let is_optional = - page.descriptor().type_().get_basic_info().repetition() == &Repetition::Optional; - - match (page.encoding(), is_optional) { - (Encoding::Plain, true) => Ok(State::Optional(Optional::new(page))), - (Encoding::Plain, false) => Ok(State::Required(Required::new(page))), - _ => Err(utils::not_implemented( - &page.encoding(), - is_optional, - false, - "any", - "Boolean", + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); + + match (page.encoding(), is_optional, is_filtered) { + (Encoding::Plain, true, false) => Ok(State::Optional( + OptionalPageValidity::new(page), + Values::new(page), + )), + (Encoding::Plain, false, false) => Ok(State::Required(Required::new(page))), + (Encoding::Plain, true, true) => Ok(State::FilteredOptional( + FilteredOptionalPageValidity::new(page), + Values::new(page), )), + (Encoding::Plain, false, true) => { + Ok(State::FilteredRequired(FilteredRequired::new(page))) + } + _ => Err(utils::not_implemented(page)), } } @@ -119,18 +146,33 @@ impl<'a> Decoder<'a> for BooleanDecoder { ) { let (values, validity) = decoded; match state { - State::Optional(page) => extend_from_decoder( + State::Optional(page_validity, page_values) => extend_from_decoder( validity, - &mut page.validity, + page_validity, Some(remaining), values, - &mut page.values, + &mut page_values.0, ), State::Required(page) => { let remaining = remaining.min(page.length - page.offset); values.extend_from_slice(page.values, page.offset, remaining); page.offset += remaining; } + State::FilteredRequired(page) => { + values.reserve(remaining); + for item in page.values.by_ref().take(remaining) { + values.push(item) + } + } + State::FilteredOptional(page_validity, page_values) => { + utils::extend_from_decoder( + validity, + page_validity, + Some(remaining), + values, + page_values.0.by_ref(), + ); + } } } } diff --git a/src/io/parquet/read/deserialize/boolean/nested.rs b/src/io/parquet/read/deserialize/boolean/nested.rs index 276283f9080..5f30c698a80 100644 --- a/src/io/parquet/read/deserialize/boolean/nested.rs +++ b/src/io/parquet/read/deserialize/boolean/nested.rs @@ -65,23 +65,18 @@ impl<'a> Decoder<'a> for BooleanDecoder { fn build_state(&self, page: &'a DataPage) -> Result { let is_optional = - page.descriptor().type_().get_basic_info().repetition() == &Repetition::Optional; + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); - match (page.encoding(), is_optional) { - (Encoding::Plain, true) => { + match (page.encoding(), is_optional, is_filtered) { + 
(Encoding::Plain, true, false) => { let (_, _, values) = utils::split_buffer(page); let values = BitmapIter::new(values, 0, values.len() * 8); Ok(State::Optional(Optional::new(page), values)) } - (Encoding::Plain, false) => Ok(State::Required(Required::new(page))), - _ => Err(utils::not_implemented( - &page.encoding(), - is_optional, - false, - "any", - "Boolean", - )), + (Encoding::Plain, false, false) => Ok(State::Required(Required::new(page))), + _ => Err(utils::not_implemented(page)), } } @@ -96,24 +91,19 @@ impl<'a> Decoder<'a> for BooleanDecoder { &self, state: &mut State, decoded: &mut Self::DecodedState, - required: usize, + additional: usize, ) { let (values, validity) = decoded; match state { State::Optional(page_validity, page_values) => { - let max_def = page_validity.max_def(); - read_optional_values( - page_validity.definition_levels.by_ref(), - max_def, - page_values.by_ref(), - values, - validity, - required, - ) + let items = page_validity.by_ref().take(additional); + let items = Zip::new(items, page_values.by_ref()); + + read_optional_values(items, values, validity) } State::Required(page) => { - values.extend_from_slice(page.values, page.offset, required); - page.offset += required; + values.extend_from_slice(page.values, page.offset, additional); + page.offset += additional; } } } diff --git a/src/io/parquet/read/deserialize/dictionary.rs b/src/io/parquet/read/deserialize/dictionary.rs index 96aa1b8034d..7a2103c96ee 100644 --- a/src/io/parquet/read/deserialize/dictionary.rs +++ b/src/io/parquet/read/deserialize/dictionary.rs @@ -1,6 +1,7 @@ use std::{collections::VecDeque, sync::Arc}; use parquet2::{ + deserialize::SliceFilteredIter, encoding::{hybrid_rle::HybridRleDecoder, Encoding}, page::{DataPage, DictPage}, schema::Repetition, @@ -13,75 +14,59 @@ use crate::{ }; use super::{ - utils::{self, extend_from_decoder, DecodedState, Decoder, MaybeNext, OptionalPageValidity}, + utils::{ + self, dict_indices_decoder, extend_from_decoder, get_selected_rows, DecodedState, Decoder, + FilteredOptionalPageValidity, MaybeNext, OptionalPageValidity, + }, DataPages, }; // The state of a `DataPage` of `Primitive` parquet primitive type #[derive(Debug)] -pub enum State<'a, K> -where - K: DictionaryKey, -{ - Optional(Optional<'a, K>), - Required(Required<'a, K>), +pub enum State<'a> { + Optional(Optional<'a>), + Required(Required<'a>), + FilteredRequired(FilteredRequired<'a>), + FilteredOptional(FilteredOptionalPageValidity<'a>, HybridRleDecoder<'a>), } -#[inline] -fn values_iter1( - indices_buffer: &[u8], - additional: usize, -) -> std::iter::Map K>> -where - K: DictionaryKey, -{ - // SPEC: Data page format: the bit width used to encode the entry ids stored as 1 byte (max bit width = 32), - // SPEC: followed by the values encoded using RLE/Bit packed described above (with the given bit width). 
- let bit_width = indices_buffer[0]; - let indices_buffer = &indices_buffer[1..]; +#[derive(Debug)] +pub struct Required<'a> { + values: HybridRleDecoder<'a>, +} - let new_indices = HybridRleDecoder::new(indices_buffer, bit_width as u32, additional); - new_indices.map(Box::new(|x| K::from_u32(x).unwrap()) as _) +impl<'a> Required<'a> { + fn new(page: &'a DataPage) -> Self { + let values = dict_indices_decoder(page); + Self { values } + } } #[derive(Debug)] -pub struct Required<'a, K> -where - K: DictionaryKey, -{ - values: std::iter::Map, Box K + 'a>>, +pub struct FilteredRequired<'a> { + values: SliceFilteredIter>, } -impl<'a, K> Required<'a, K> -where - K: DictionaryKey, -{ +impl<'a> FilteredRequired<'a> { fn new(page: &'a DataPage) -> Self { - let (_, _, indices_buffer) = utils::split_buffer(page); + let values = dict_indices_decoder(page); - let values = values_iter1(indices_buffer, page.num_values()); + let rows = get_selected_rows(page); + let values = SliceFilteredIter::new(values, rows); Self { values } } } #[derive(Debug)] -pub struct Optional<'a, K> -where - K: DictionaryKey, -{ - values: std::iter::Map, Box K + 'a>>, +pub struct Optional<'a> { + values: HybridRleDecoder<'a>, validity: OptionalPageValidity<'a>, } -impl<'a, K> Optional<'a, K> -where - K: DictionaryKey, -{ +impl<'a> Optional<'a> { fn new(page: &'a DataPage) -> Self { - let (_, _, indices_buffer) = utils::split_buffer(page); - - let values = values_iter1(indices_buffer, page.num_values()); + let values = dict_indices_decoder(page); Self { values, @@ -90,14 +75,13 @@ where } } -impl<'a, K> utils::PageState<'a> for State<'a, K> -where - K: DictionaryKey, -{ +impl<'a> utils::PageState<'a> for State<'a> { fn len(&self) -> usize { match self { State::Optional(optional) => optional.validity.len(), State::Required(required) => required.values.size_hint().0, + State::FilteredRequired(required) => required.values.size_hint().0, + State::FilteredOptional(validity, _) => validity.len(), } } } @@ -126,27 +110,31 @@ impl<'a, K> utils::Decoder<'a> for PrimitiveDecoder where K: DictionaryKey, { - type State = State<'a, K>; + type State = State<'a>; type DecodedState = (Vec, MutableBitmap); fn build_state(&self, page: &'a DataPage) -> Result { let is_optional = - page.descriptor().type_().get_basic_info().repetition() == &Repetition::Optional; + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); - match (page.encoding(), is_optional) { - (Encoding::PlainDictionary | Encoding::RleDictionary, false) => { + match (page.encoding(), is_optional, is_filtered) { + (Encoding::PlainDictionary | Encoding::RleDictionary, false, false) => { Ok(State::Required(Required::new(page))) } - (Encoding::PlainDictionary | Encoding::RleDictionary, true) => { + (Encoding::PlainDictionary | Encoding::RleDictionary, true, false) => { Ok(State::Optional(Optional::new(page))) } - _ => Err(utils::not_implemented( - &page.encoding(), - is_optional, - false, - "any", - "Primitive", - )), + (Encoding::PlainDictionary | Encoding::RleDictionary, false, true) => { + Ok(State::FilteredRequired(FilteredRequired::new(page))) + } + (Encoding::PlainDictionary | Encoding::RleDictionary, true, true) => { + Ok(State::FilteredOptional( + FilteredOptionalPageValidity::new(page), + dict_indices_decoder(page), + )) + } + _ => Err(utils::not_implemented(page)), } } @@ -170,10 +158,30 @@ where &mut page.validity, Some(remaining), values, - &mut page.values, + &mut page.values.by_ref().map(|x| 
K::from_u32(x).unwrap()), ), State::Required(page) => { - values.extend(page.values.by_ref().take(remaining)); + values.extend( + page.values + .by_ref() + .map(|x| K::from_u32(x).unwrap()) + .take(remaining), + ); + } + State::FilteredOptional(page_validity, page_values) => extend_from_decoder( + validity, + page_validity, + Some(remaining), + values, + &mut page_values.by_ref().map(|x| K::from_u32(x).unwrap()), + ), + State::FilteredRequired(page) => { + values.extend( + page.values + .by_ref() + .map(|x| K::from_u32(x).unwrap()) + .take(remaining), + ); } } } @@ -233,7 +241,7 @@ pub(super) fn next_dict< }; // there is a new page => consume the page from the start - let maybe_page = PrimitiveDecoder::default().build_state(page); + let maybe_page = PrimitiveDecoder::::default().build_state(page); let page = match maybe_page { Ok(page) => page, Err(e) => return MaybeNext::Some(Err(e)), diff --git a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs index 260fafa7eb3..c4645e595aa 100644 --- a/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs +++ b/src/io/parquet/read/deserialize/fixed_size_binary/basic.rs @@ -1,6 +1,7 @@ use std::collections::VecDeque; use parquet2::{ + deserialize::SliceFilteredIter, encoding::{hybrid_rle, Encoding}, page::{DataPage, FixedLenByteArrayPageDict}, schema::Repetition, @@ -11,8 +12,9 @@ use crate::{ }; use super::super::utils::{ - dict_indices_decoder, extend_from_decoder, next, not_implemented, split_buffer, DecodedState, - Decoder, MaybeNext, OptionalPageValidity, PageState, Pushable, + dict_indices_decoder, extend_from_decoder, get_selected_rows, next, not_implemented, + split_buffer, DecodedState, Decoder, FilteredOptionalPageValidity, MaybeNext, + OptionalPageValidity, PageState, Pushable, }; use super::super::DataPages; use super::utils::FixedSizeBinary; @@ -37,33 +39,59 @@ impl<'a> Optional<'a> { struct Required<'a> { pub values: std::slice::ChunksExact<'a, u8>, - pub remaining: usize, } impl<'a> Required<'a> { fn new(page: &'a DataPage, size: usize) -> Self { - Self { - values: page.buffer().chunks_exact(size), - remaining: page.num_values(), - } + let values = page.buffer(); + assert_eq!(values.len() % size, 0); + let values = values.chunks_exact(size); + Self { values } + } + + #[inline] + pub fn len(&self) -> usize { + self.values.size_hint().0 + } +} + +struct FilteredRequired<'a> { + pub values: SliceFilteredIter>, +} + +impl<'a> FilteredRequired<'a> { + fn new(page: &'a DataPage, size: usize) -> Self { + let values = page.buffer(); + assert_eq!(values.len() % size, 0); + let values = values.chunks_exact(size); + + let rows = get_selected_rows(page); + let values = SliceFilteredIter::new(values, rows); + + Self { values } + } + + #[inline] + pub fn len(&self) -> usize { + self.values.size_hint().0 } } struct RequiredDictionary<'a> { pub values: hybrid_rle::HybridRleDecoder<'a>, - pub remaining: usize, dict: &'a FixedLenByteArrayPageDict, } impl<'a> RequiredDictionary<'a> { fn new(page: &'a DataPage, dict: &'a FixedLenByteArrayPageDict) -> Self { - let values = dict_indices_decoder(page.buffer(), page.num_values()); + let values = dict_indices_decoder(page); - Self { - values, - remaining: page.num_values(), - dict, - } + Self { dict, values } + } + + #[inline] + pub fn len(&self) -> usize { + self.values.size_hint().0 } } @@ -75,9 +103,7 @@ struct OptionalDictionary<'a> { impl<'a> OptionalDictionary<'a> { fn new(page: &'a DataPage, dict: &'a FixedLenByteArrayPageDict) -> 
Self { - let (_, _, indices_buffer) = split_buffer(page); - - let values = dict_indices_decoder(indices_buffer, page.num_values()); + let values = dict_indices_decoder(page); Self { values, @@ -92,15 +118,22 @@ enum State<'a> { Required(Required<'a>), RequiredDictionary(RequiredDictionary<'a>), OptionalDictionary(OptionalDictionary<'a>), + FilteredRequired(FilteredRequired<'a>), + FilteredOptional( + FilteredOptionalPageValidity<'a>, + std::slice::ChunksExact<'a, u8>, + ), } impl<'a> PageState<'a> for State<'a> { fn len(&self) -> usize { match self { State::Optional(state) => state.validity.len(), - State::Required(state) => state.remaining, - State::RequiredDictionary(state) => state.remaining, + State::Required(state) => state.len(), + State::RequiredDictionary(state) => state.len(), State::OptionalDictionary(state) => state.validity.len(), + State::FilteredRequired(state) => state.len(), + State::FilteredOptional(state, _) => state.len(), } } } @@ -121,30 +154,45 @@ impl<'a> Decoder<'a> for BinaryDecoder { fn build_state(&self, page: &'a DataPage) -> Result { let is_optional = - page.descriptor().type_().get_basic_info().repetition() == &Repetition::Optional; - - match (page.encoding(), page.dictionary_page(), is_optional) { - (Encoding::Plain, None, true) => Ok(State::Optional(Optional::new(page, self.size))), - (Encoding::Plain, None, false) => Ok(State::Required(Required::new(page, self.size))), - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => { + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); + + match ( + page.encoding(), + page.dictionary_page(), + is_optional, + is_filtered, + ) { + (Encoding::Plain, None, true, false) => { + Ok(State::Optional(Optional::new(page, self.size))) + } + (Encoding::Plain, None, false, false) => { + Ok(State::Required(Required::new(page, self.size))) + } + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { Ok(State::RequiredDictionary(RequiredDictionary::new( page, dict.as_any().downcast_ref().unwrap(), ))) } - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true) => { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true, false) => { Ok(State::OptionalDictionary(OptionalDictionary::new( page, dict.as_any().downcast_ref().unwrap(), ))) } - _ => Err(not_implemented( - &page.encoding(), - is_optional, - false, - "any", - "FixedBinary", + (Encoding::Plain, None, false, true) => Ok(State::FilteredRequired( + FilteredRequired::new(page, self.size), )), + (Encoding::Plain, _, true, true) => { + let (_, _, values) = split_buffer(page); + + Ok(State::FilteredOptional( + FilteredOptionalPageValidity::new(page), + values.chunks_exact(self.size), + )) + } + _ => Err(not_implemented(page)), } } @@ -172,7 +220,11 @@ impl<'a> Decoder<'a> for BinaryDecoder { &mut page.values, ), State::Required(page) => { - page.remaining -= remaining; + for x in page.values.by_ref().take(remaining) { + values.push(x) + } + } + State::FilteredRequired(page) => { for x in page.values.by_ref().take(remaining) { values.push(x) } @@ -201,11 +253,19 @@ impl<'a> Decoder<'a> for BinaryDecoder { &dict_values[index * size..(index + 1) * size] }; - page.remaining -= remaining; for x in page.values.by_ref().map(op).take(remaining) { values.push(x) } } + State::FilteredOptional(page_validity, page_values) => { + extend_from_decoder( + validity, + page_validity, + Some(remaining), + values, + page_values.by_ref(), + 
); + } } } } diff --git a/src/io/parquet/read/deserialize/mod.rs b/src/io/parquet/read/deserialize/mod.rs index bb688a12f89..0d5706354c0 100644 --- a/src/io/parquet/read/deserialize/mod.rs +++ b/src/io/parquet/read/deserialize/mod.rs @@ -17,6 +17,7 @@ use crate::{ }; use self::nested_utils::{InitNested, NestedArrayIter, NestedState}; +use parquet2::schema::types::PrimitiveType; use simple::page_iter_to_arrays; use super::*; @@ -27,7 +28,7 @@ pub fn get_page_iterator( reader: R, pages_filter: Option, buffer: Vec, -) -> Result> { +) -> Result> { Ok(_get_page_iterator( column_metadata, reader, @@ -76,7 +77,7 @@ fn create_list( fn columns_to_iter_recursive<'a, I: 'a>( mut columns: Vec, - mut types: Vec<&ParquetType>, + mut types: Vec<&PrimitiveType>, field: Field, mut init: Vec, chunk_size: usize, @@ -238,7 +239,7 @@ fn field_to_init(field: &Field) -> Vec { /// The arrays are guaranteed to be at most of size `chunk_size` and data type `field.data_type`. pub fn column_iter_to_arrays<'a, I: 'a>( columns: Vec, - types: Vec<&ParquetType>, + types: Vec<&PrimitiveType>, field: Field, chunk_size: usize, ) -> Result> diff --git a/src/io/parquet/read/deserialize/nested_utils.rs b/src/io/parquet/read/deserialize/nested_utils.rs index cb1f977cd1f..b74f7c24734 100644 --- a/src/io/parquet/read/deserialize/nested_utils.rs +++ b/src/io/parquet/read/deserialize/nested_utils.rs @@ -7,6 +7,7 @@ use parquet2::{ use crate::{array::Array, bitmap::MutableBitmap, error::Result}; use super::super::DataPages; +pub use super::utils::Zip; use super::utils::{split_buffer, DecodedState, Decoder, MaybeNext, Pushable}; /// trait describing deserialized repetition and definition levels @@ -203,31 +204,19 @@ impl Nested for NestedStruct { } } -pub(super) fn read_optional_values( - def_levels: D, - max_def: u32, - mut new_values: G, - values: &mut P, - validity: &mut MutableBitmap, - mut remaining: usize, -) where - D: Iterator, - G: Iterator, +pub(super) fn read_optional_values(items: D, values: &mut P, validity: &mut MutableBitmap) +where + D: Iterator>, C: Default, P: Pushable, { - for def in def_levels { - if def == max_def { - values.push(new_values.next().unwrap()); + for item in items { + if let Some(item) = item { + values.push(item); validity.push(true); - remaining -= 1; - } else if def == max_def - 1 { - values.push(C::default()); + } else { + values.push_null(); validity.push(false); - remaining -= 1; - } - if remaining == 0 { - break; } } } @@ -283,8 +272,8 @@ impl<'a> NestedPage<'a> { pub fn new(page: &'a DataPage) -> Self { let (rep_levels, def_levels, _) = split_buffer(page); - let max_rep_level = page.descriptor().max_rep_level(); - let max_def_level = page.descriptor().max_def_level(); + let max_rep_level = page.descriptor.max_rep_level; + let max_def_level = page.descriptor.max_def_level; let reps = HybridRleDecoder::new(rep_levels, get_bit_width(max_rep_level), page.num_values()); @@ -440,37 +429,44 @@ fn extend_offsets2<'a>(page: &mut NestedPage<'a>, nested: &mut NestedState, addi } } -// The state of an optional DataPage with a boolean physical type #[derive(Debug)] pub struct Optional<'a> { - pub definition_levels: HybridRleDecoder<'a>, - max_def: u32, + iter: HybridRleDecoder<'a>, + max: u32, +} + +impl<'a> Iterator for Optional<'a> { + type Item = bool; + + #[inline] + fn next(&mut self) -> Option { + self.iter.next().and_then(|x| { + if x == self.max { + Some(true) + } else if x == self.max - 1 { + Some(false) + } else { + self.next() + } + }) + } } impl<'a> Optional<'a> { pub fn new(page: &'a 
DataPage) -> Self { let (_, def_levels, _) = split_buffer(page); - let max_def = page.descriptor().max_def_level(); + let max_def = page.descriptor.max_def_level; Self { - definition_levels: HybridRleDecoder::new( - def_levels, - get_bit_width(max_def), - page.num_values(), - ), - max_def: max_def as u32, + iter: HybridRleDecoder::new(def_levels, get_bit_width(max_def), page.num_values()), + max: max_def as u32, } } #[inline] pub fn len(&self) -> usize { - self.definition_levels.size_hint().0 - } - - #[inline] - pub fn max_def(&self) -> u32 { - self.max_def + unreachable!(); } } diff --git a/src/io/parquet/read/deserialize/primitive/basic.rs b/src/io/parquet/read/deserialize/primitive/basic.rs index ed6d30419f9..abb766a6968 100644 --- a/src/io/parquet/read/deserialize/primitive/basic.rs +++ b/src/io/parquet/read/deserialize/primitive/basic.rs @@ -1,6 +1,7 @@ use std::collections::VecDeque; use parquet2::{ + deserialize::SliceFilteredIter, encoding::{hybrid_rle, Encoding}, page::{DataPage, PrimitivePageDict}, schema::Repetition, @@ -14,9 +15,33 @@ use crate::{ }; use super::super::utils; -use super::super::utils::OptionalPageValidity; +use super::super::utils::{get_selected_rows, FilteredOptionalPageValidity, OptionalPageValidity}; use super::super::DataPages; +#[derive(Debug)] +struct FilteredRequiredValues<'a> { + values: SliceFilteredIter>, +} + +impl<'a> FilteredRequiredValues<'a> { + pub fn new(page: &'a DataPage) -> Self { + let (_, _, values) = utils::split_buffer(page); + assert_eq!(values.len() % std::mem::size_of::
<P>
(), 0); + + let values = values.chunks_exact(std::mem::size_of::
<P>
()); + + let rows = get_selected_rows(page); + let values = SliceFilteredIter::new(values, rows); + + Self { values } + } + + #[inline] + pub fn len(&self) -> usize { + self.values.size_hint().0 + } +} + #[derive(Debug)] pub(super) struct Values<'a> { pub values: std::slice::ChunksExact<'a, u8>, @@ -51,8 +76,7 @@ where P: ParquetNativeType, { pub fn new(page: &'a DataPage, dict: &'a PrimitivePageDict
<P>
) -> Self { - let (_, _, indices_buffer) = utils::split_buffer(page); - let values = utils::dict_indices_decoder(indices_buffer, page.num_values()); + let values = utils::dict_indices_decoder(page); Self { dict: dict.values(), @@ -76,6 +100,8 @@ where Required(Values<'a>), RequiredDictionary(ValuesDictionary<'a, P>), OptionalDictionary(OptionalPageValidity<'a>, ValuesDictionary<'a, P>), + FilteredRequired(FilteredRequiredValues<'a>), + FilteredOptional(FilteredOptionalPageValidity<'a>, Values<'a>), } impl<'a, P> utils::PageState<'a> for State<'a, P> @@ -88,6 +114,8 @@ where State::Required(values) => values.len(), State::RequiredDictionary(values) => values.len(), State::OptionalDictionary(optional, _) => optional.len(), + State::FilteredRequired(values) => values.len(), + State::FilteredOptional(optional, _) => optional.len(), } } } @@ -137,14 +165,20 @@ where fn build_state(&self, page: &'a DataPage) -> Result { let is_optional = - page.descriptor().type_().get_basic_info().repetition() == &Repetition::Optional; + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); - match (page.encoding(), page.dictionary_page(), is_optional) { - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => { + match ( + page.encoding(), + page.dictionary_page(), + is_optional, + is_filtered, + ) { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { let dict = dict.as_any().downcast_ref().unwrap(); Ok(State::RequiredDictionary(ValuesDictionary::new(page, dict))) } - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true) => { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true, false) => { let dict = dict.as_any().downcast_ref().unwrap(); Ok(State::OptionalDictionary( @@ -152,20 +186,21 @@ where ValuesDictionary::new(page, dict), )) } - (Encoding::Plain, _, true) => { + (Encoding::Plain, _, true, false) => { let validity = OptionalPageValidity::new(page); let values = Values::new::
<P>
(page); Ok(State::Optional(validity, values)) } - (Encoding::Plain, _, false) => Ok(State::Required(Values::new::
<P>
(page))), - _ => Err(utils::not_implemented( - &page.encoding(), - is_optional, - false, - "any", - "Primitive", + (Encoding::Plain, _, false, false) => Ok(State::Required(Values::new::
<P>
(page))), + (Encoding::Plain, _, false, true) => Ok(State::FilteredRequired( + FilteredRequiredValues::new::
<P>
(page), )), + (Encoding::Plain, _, true, true) => Ok(State::FilteredOptional( + FilteredOptionalPageValidity::new(page), + Values::new::
<P>
(page), + )), + _ => Err(utils::not_implemented(page)), } } @@ -214,6 +249,24 @@ where let op1 = |index: u32| page.dict[index as usize]; values.extend(page.values.by_ref().map(op1).map(self.op).take(remaining)); } + State::FilteredRequired(page) => { + values.extend( + page.values + .by_ref() + .map(decode) + .map(self.op) + .take(remaining), + ); + } + State::FilteredOptional(page_validity, page_values) => { + utils::extend_from_decoder( + validity, + page_validity, + Some(remaining), + values, + page_values.values.by_ref().map(decode).map(self.op), + ); + } } } } diff --git a/src/io/parquet/read/deserialize/primitive/nested.rs b/src/io/parquet/read/deserialize/primitive/nested.rs index a16217c9b18..0aff18e2578 100644 --- a/src/io/parquet/read/deserialize/primitive/nested.rs +++ b/src/io/parquet/read/deserialize/primitive/nested.rs @@ -81,31 +81,31 @@ where fn build_state(&self, page: &'a DataPage) -> Result { let is_optional = - page.descriptor().type_().get_basic_info().repetition() == &Repetition::Optional; - - match (page.encoding(), page.dictionary_page(), is_optional) { - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => { + page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); + + match ( + page.encoding(), + page.dictionary_page(), + is_optional, + is_filtered, + ) { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false, false) => { let dict = dict.as_any().downcast_ref().unwrap(); Ok(State::RequiredDictionary(ValuesDictionary::new(page, dict))) } - (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true) => { + (Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), true, false) => { let dict = dict.as_any().downcast_ref().unwrap(); Ok(State::OptionalDictionary( Optional::new(page), ValuesDictionary::new(page, dict), )) } - (Encoding::Plain, _, true) => { + (Encoding::Plain, _, true, false) => { Ok(State::Optional(Optional::new(page), Values::new::
<P>
(page))) } - (Encoding::Plain, _, false) => Ok(State::Required(Values::new::
<P>
(page))), - _ => Err(utils::not_implemented( - &page.encoding(), - is_optional, - false, - "any", - "Primitive", - )), + (Encoding::Plain, _, false, false) => Ok(State::Required(Values::new::
<P>
(page))), + _ => Err(utils::not_implemented(page)), } } @@ -120,20 +120,15 @@ where &self, state: &mut Self::State, decoded: &mut Self::DecodedState, - remaining: usize, + additional: usize, ) { let (values, validity) = decoded; match state { State::Optional(page_validity, page_values) => { - let max_def = page_validity.max_def(); - read_optional_values( - page_validity.definition_levels.by_ref(), - max_def, - page_values.values.by_ref().map(decode).map(self.op), - values, - validity, - remaining, - ) + let items = page_validity.by_ref().take(additional); + let items = Zip::new(items, page_values.values.by_ref().map(decode).map(self.op)); + + read_optional_values(items, values, validity) } State::Required(page) => { values.extend( @@ -141,24 +136,20 @@ where .by_ref() .map(decode) .map(self.op) - .take(remaining), + .take(additional), ); } State::RequiredDictionary(page) => { let op1 = |index: u32| page.dict[index as usize]; - values.extend(page.values.by_ref().map(op1).map(self.op).take(remaining)); + values.extend(page.values.by_ref().map(op1).map(self.op).take(additional)); } State::OptionalDictionary(page_validity, page_values) => { - let max_def = page_validity.max_def(); let op1 = |index: u32| page_values.dict[index as usize]; - read_optional_values( - page_validity.definition_levels.by_ref(), - max_def, - page_values.values.by_ref().map(op1).map(self.op), - values, - validity, - remaining, - ) + + let items = page_validity.by_ref().take(additional); + let items = Zip::new(items, page_values.values.by_ref().map(op1).map(self.op)); + + read_optional_values(items, values, validity) } } } diff --git a/src/io/parquet/read/deserialize/simple.rs b/src/io/parquet/read/deserialize/simple.rs index 9544f16bff1..d3a50b44c90 100644 --- a/src/io/parquet/read/deserialize/simple.rs +++ b/src/io/parquet/read/deserialize/simple.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use parquet2::{ schema::types::{ - LogicalType, ParquetType, PhysicalType, TimeUnit as ParquetTimeUnit, TimestampType, + PhysicalType, PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit, }, types::int96_to_i64_ns, }; @@ -60,24 +60,14 @@ where /// of [`DataType`] `data_type` and `chunk_size`. pub fn page_iter_to_arrays<'a, I: 'a + DataPages>( pages: I, - type_: &ParquetType, + type_: &PrimitiveType, data_type: DataType, chunk_size: usize, ) -> Result> { use DataType::*; - let (physical_type, logical_type) = if let ParquetType::PrimitiveType { - physical_type, - logical_type, - .. - } = type_ - { - (physical_type, logical_type) - } else { - return Err(ArrowError::InvalidArgumentError( - "page_iter_to_arrays can only be called with a parquet primitive type".into(), - )); - }; + let physical_type = &type_.physical_type; + let logical_type = &type_.logical_type; Ok(match data_type.to_logical_type() { Null => null::iter_to_arrays(pages, data_type, chunk_size), @@ -240,7 +230,7 @@ pub fn page_iter_to_arrays<'a, I: 'a + DataPages>( fn timestamp<'a, I: 'a + DataPages>( pages: I, physical_type: &PhysicalType, - logical_type: &Option, + logical_type: &Option, data_type: DataType, chunk_size: usize, time_unit: TimeUnit, @@ -267,35 +257,41 @@ fn timestamp<'a, I: 'a + DataPages>( let iter = primitive::Iter::new(pages, data_type, chunk_size, |x: i64| x); - let unit = if let Some(LogicalType::TIMESTAMP(TimestampType { unit, .. })) = logical_type { + let unit = if let Some(PrimitiveLogicalType::Timestamp { unit, .. 
}) = logical_type { unit } else { return Ok(dyn_iter(iden(iter))); }; Ok(match (unit, time_unit) { - (ParquetTimeUnit::MILLIS(_), TimeUnit::Second) => dyn_iter(op(iter, |x| x / 1_000)), - (ParquetTimeUnit::MICROS(_), TimeUnit::Second) => dyn_iter(op(iter, |x| x / 1_000_000)), - (ParquetTimeUnit::NANOS(_), TimeUnit::Second) => dyn_iter(op(iter, |x| x * 1_000_000_000)), + (ParquetTimeUnit::Milliseconds, TimeUnit::Second) => dyn_iter(op(iter, |x| x / 1_000)), + (ParquetTimeUnit::Microseconds, TimeUnit::Second) => dyn_iter(op(iter, |x| x / 1_000_000)), + (ParquetTimeUnit::Nanoseconds, TimeUnit::Second) => { + dyn_iter(op(iter, |x| x / 1_000_000_000)) + } - (ParquetTimeUnit::MILLIS(_), TimeUnit::Millisecond) => dyn_iter(iden(iter)), - (ParquetTimeUnit::MICROS(_), TimeUnit::Millisecond) => dyn_iter(op(iter, |x| x / 1_000)), - (ParquetTimeUnit::NANOS(_), TimeUnit::Millisecond) => dyn_iter(op(iter, |x| x / 1_000_000)), + (ParquetTimeUnit::Milliseconds, TimeUnit::Millisecond) => dyn_iter(iden(iter)), + (ParquetTimeUnit::Microseconds, TimeUnit::Millisecond) => dyn_iter(op(iter, |x| x / 1_000)), + (ParquetTimeUnit::Nanoseconds, TimeUnit::Millisecond) => { + dyn_iter(op(iter, |x| x / 1_000_000)) + } - (ParquetTimeUnit::MILLIS(_), TimeUnit::Microsecond) => dyn_iter(op(iter, |x| x * 1_000)), - (ParquetTimeUnit::MICROS(_), TimeUnit::Microsecond) => dyn_iter(iden(iter)), - (ParquetTimeUnit::NANOS(_), TimeUnit::Microsecond) => dyn_iter(op(iter, |x| x / 1_000)), + (ParquetTimeUnit::Milliseconds, TimeUnit::Microsecond) => dyn_iter(op(iter, |x| x * 1_000)), + (ParquetTimeUnit::Microseconds, TimeUnit::Microsecond) => dyn_iter(iden(iter)), + (ParquetTimeUnit::Nanoseconds, TimeUnit::Microsecond) => dyn_iter(op(iter, |x| x / 1_000)), - (ParquetTimeUnit::MILLIS(_), TimeUnit::Nanosecond) => dyn_iter(op(iter, |x| x * 1_000_000)), - (ParquetTimeUnit::MICROS(_), TimeUnit::Nanosecond) => dyn_iter(op(iter, |x| x * 1_000)), - (ParquetTimeUnit::NANOS(_), TimeUnit::Nanosecond) => dyn_iter(iden(iter)), + (ParquetTimeUnit::Milliseconds, TimeUnit::Nanosecond) => { + dyn_iter(op(iter, |x| x * 1_000_000)) + } + (ParquetTimeUnit::Microseconds, TimeUnit::Nanosecond) => dyn_iter(op(iter, |x| x * 1_000)), + (ParquetTimeUnit::Nanoseconds, TimeUnit::Nanosecond) => dyn_iter(iden(iter)), }) } fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( pages: I, physical_type: &PhysicalType, - logical_type: &Option, + logical_type: &Option, data_type: DataType, chunk_size: usize, time_unit: TimeUnit, @@ -315,7 +311,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( } }; - let unit = if let Some(LogicalType::TIMESTAMP(TimestampType { unit, .. })) = logical_type { + let unit = if let Some(PrimitiveLogicalType::Timestamp { unit, .. 
}) = logical_type { unit } else { return Ok(dyn_iter(primitive::DictIter::::new( @@ -327,7 +323,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( }; Ok(match (unit, time_unit) { - (ParquetTimeUnit::MILLIS(_), TimeUnit::Second) => { + (ParquetTimeUnit::Milliseconds, TimeUnit::Second) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -335,7 +331,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( |x: i64| x / 1_000, )) } - (ParquetTimeUnit::MICROS(_), TimeUnit::Second) => { + (ParquetTimeUnit::Microseconds, TimeUnit::Second) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -343,16 +339,16 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( |x: i64| x / 1_000_000, )) } - (ParquetTimeUnit::NANOS(_), TimeUnit::Second) => { + (ParquetTimeUnit::Nanoseconds, TimeUnit::Second) => { dyn_iter(primitive::DictIter::::new( pages, data_type, chunk_size, - |x: i64| x * 1_000_000_000, + |x: i64| x / 1_000_000_000, )) } - (ParquetTimeUnit::MILLIS(_), TimeUnit::Millisecond) => { + (ParquetTimeUnit::Milliseconds, TimeUnit::Millisecond) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -360,7 +356,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( |x: i64| x, )) } - (ParquetTimeUnit::MICROS(_), TimeUnit::Millisecond) => { + (ParquetTimeUnit::Microseconds, TimeUnit::Millisecond) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -368,7 +364,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( |x: i64| x / 1_000, )) } - (ParquetTimeUnit::NANOS(_), TimeUnit::Millisecond) => { + (ParquetTimeUnit::Nanoseconds, TimeUnit::Millisecond) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -377,7 +373,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( )) } - (ParquetTimeUnit::MILLIS(_), TimeUnit::Microsecond) => { + (ParquetTimeUnit::Milliseconds, TimeUnit::Microsecond) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -385,7 +381,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( |x: i64| x * 1_000, )) } - (ParquetTimeUnit::MICROS(_), TimeUnit::Microsecond) => { + (ParquetTimeUnit::Microseconds, TimeUnit::Microsecond) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -393,7 +389,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( |x: i64| x, )) } - (ParquetTimeUnit::NANOS(_), TimeUnit::Microsecond) => { + (ParquetTimeUnit::Nanoseconds, TimeUnit::Microsecond) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -402,7 +398,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( )) } - (ParquetTimeUnit::MILLIS(_), TimeUnit::Nanosecond) => { + (ParquetTimeUnit::Milliseconds, TimeUnit::Nanosecond) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -410,7 +406,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( |x: i64| x * 1_000_000, )) } - (ParquetTimeUnit::MICROS(_), TimeUnit::Nanosecond) => { + (ParquetTimeUnit::Microseconds, TimeUnit::Nanosecond) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -418,7 +414,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( |x: i64| x * 1_000, )) } - (ParquetTimeUnit::NANOS(_), TimeUnit::Nanosecond) => { + (ParquetTimeUnit::Nanoseconds, TimeUnit::Nanosecond) => { dyn_iter(primitive::DictIter::::new( pages, data_type, @@ -432,7 +428,7 @@ fn timestamp_dict<'a, K: DictionaryKey, I: 'a + DataPages>( fn dict_read<'a, K: DictionaryKey, I: 'a + DataPages>( iter: I, physical_type: &PhysicalType, - logical_type: &Option, + 
logical_type: &Option, data_type: DataType, chunk_size: usize, ) -> Result> { diff --git a/src/io/parquet/read/deserialize/utils.rs b/src/io/parquet/read/deserialize/utils.rs index f9e9cfe20be..7bda733684f 100644 --- a/src/io/parquet/read/deserialize/utils.rs +++ b/src/io/parquet/read/deserialize/utils.rs @@ -1,9 +1,12 @@ use std::collections::VecDeque; -use std::convert::TryInto; -use parquet2::encoding::{hybrid_rle, Encoding}; +use parquet2::deserialize::{ + FilteredHybridEncoded, FilteredHybridRleDecoderIter, HybridDecoderBitmapIter, HybridEncoded, +}; +use parquet2::encoding::hybrid_rle; +use parquet2::indexes::Interval; use parquet2::page::{split_buffer as _split_buffer, DataPage}; -use streaming_iterator::{convert, Convert, StreamingIterator}; +use parquet2::schema::Repetition; use crate::bitmap::utils::BitmapIter; use crate::bitmap::MutableBitmap; @@ -11,51 +14,29 @@ use crate::error::ArrowError; use super::super::DataPages; -#[derive(Debug)] -pub struct BinaryIter<'a> { - values: &'a [u8], -} - -impl<'a> BinaryIter<'a> { - pub fn new(values: &'a [u8]) -> Self { - Self { values } - } -} - -impl<'a> Iterator for BinaryIter<'a> { - type Item = &'a [u8]; - - #[inline] - fn next(&mut self) -> Option { - if self.values.is_empty() { - return None; - } - let length = u32::from_le_bytes(self.values[0..4].try_into().unwrap()) as usize; - self.values = &self.values[4..]; - let result = &self.values[..length]; - self.values = &self.values[length..]; - Some(result) - } -} - -pub fn not_implemented( - encoding: &Encoding, - is_optional: bool, - has_dict: bool, - version: &str, - physical_type: &str, -) -> ArrowError { +pub fn not_implemented(page: &DataPage) -> ArrowError { + let is_optional = page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; + let is_filtered = page.selected_rows().is_some(); let required = if is_optional { "optional" } else { "required" }; - let dict = if has_dict { ", dictionary-encoded" } else { "" }; + let is_filtered = if is_filtered { ", index-filtered" } else { "" }; + let dict = if page.dictionary_page().is_some() { + ", dictionary-encoded" + } else { + "" + }; ArrowError::NotYetImplemented(format!( - "Decoding \"{:?}\"-encoded{} {} {} pages is not yet implemented for {}", - encoding, dict, required, version, physical_type + "Decoding {:?} \"{:?}\"-encoded{} {} {} parquet pages", + page.descriptor.primitive_type.physical_type, + page.encoding(), + dict, + required, + is_filtered, )) } #[inline] pub fn split_buffer(page: &DataPage) -> (&[u8], &[u8], &[u8]) { - _split_buffer(page, page.descriptor()) + _split_buffer(page) } /// A private trait representing structs that can receive elements. 
@@ -111,43 +92,210 @@ impl Pushable for Vec {
     }
 }
 
-#[derive(Debug)]
+/// The state of a partially deserialized page
+pub(super) trait PageValidity<'a> {
+    fn next_limited(&mut self, limit: usize) -> Option<FilteredHybridEncoded<'a>>;
+}
+
+#[derive(Debug, Clone)]
+pub struct FilteredOptionalPageValidity<'a> {
+    iter: FilteredHybridRleDecoderIter<'a>,
+    current: Option<(FilteredHybridEncoded<'a>, usize)>,
+}
+
+impl<'a> FilteredOptionalPageValidity<'a> {
+    pub fn new(page: &'a DataPage) -> Self {
+        let (_, validity, _) = split_buffer(page);
+
+        let iter = hybrid_rle::Decoder::new(validity, 1);
+        let iter = HybridDecoderBitmapIter::new(iter, page.num_values());
+        let selected_rows = get_selected_rows(page);
+        let iter = FilteredHybridRleDecoderIter::new(iter, selected_rows);
+
+        Self {
+            iter,
+            current: None,
+        }
+    }
+
+    pub fn len(&self) -> usize {
+        self.iter.len()
+    }
+}
+
+pub fn get_selected_rows(page: &DataPage) -> VecDeque<Interval> {
+    page.selected_rows()
+        .unwrap_or(&[Interval::new(0, page.num_values())])
+        .iter()
+        .copied()
+        .collect()
+}
+
+impl<'a> PageValidity<'a> for FilteredOptionalPageValidity<'a> {
+    fn next_limited(&mut self, limit: usize) -> Option<FilteredHybridEncoded<'a>> {
+        let (run, own_offset) = if let Some((run, offset)) = self.current {
+            (run, offset)
+        } else {
+            // a new run
+            let run = self.iter.next()?; // no run -> None
+            self.current = Some((run, 0));
+            return self.next_limited(limit);
+        };
+
+        match run {
+            FilteredHybridEncoded::Bitmap {
+                values,
+                offset,
+                length,
+            } => {
+                let run_length = length - own_offset;
+
+                let length = limit.min(run_length);
+
+                if length == run_length {
+                    self.current = None;
+                } else {
+                    self.current = Some((run, own_offset + length));
+                }
+
+                Some(FilteredHybridEncoded::Bitmap {
+                    values,
+                    // skip over the bits of this run already consumed by a previous call
+                    offset: offset + own_offset,
+                    length,
+                })
+            }
+            FilteredHybridEncoded::Repeated { is_set, length } => {
+                let run_length = length - own_offset;
+
+                let length = limit.min(run_length);
+
+                if length == run_length {
+                    self.current = None;
+                } else {
+                    self.current = Some((run, own_offset + length));
+                }
+
+                Some(FilteredHybridEncoded::Repeated { is_set, length })
+            }
+            FilteredHybridEncoded::Skipped(set) => {
+                self.current = None;
+                Some(FilteredHybridEncoded::Skipped(set))
+            }
+        }
+    }
+}
+
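+/// Zips a validity iterator of `bool` with a values iterator, yielding
+/// `Some(value)` for set validity slots and `None` for null slots.
+/// For example, validity `[true, false, true]` over values `[1, 2]` yields
+/// `Some(1), None, Some(2)`.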
+pub struct Zip<V, I> {
+    validity: V,
+    values: I,
+}
+
+impl<V, I> Zip<V, I> {
+    pub fn new(validity: V, values: I) -> Self {
+        Self { validity, values }
+    }
+}
+
+impl<V: Iterator<Item = bool>, I: Iterator> Iterator for Zip<V, I> {
+    type Item = Option<I::Item>;
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        self.validity
+            .next()
+            .map(|x| if x { self.values.next() } else { None })
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.validity.size_hint()
+    }
+}
+
+#[derive(Debug, Clone)]
 pub struct OptionalPageValidity<'a> {
-    validity: Convert<hybrid_rle::Decoder<'a>>,
-    // invariants:
-    // * run_offset < length
-    // * consumed < length
-    run_offset: usize,
-    consumed: usize,
-    length: usize,
+    iter: HybridDecoderBitmapIter<'a>,
+    current: Option<(HybridEncoded<'a>, usize)>,
 }
 
 impl<'a> OptionalPageValidity<'a> {
-    #[inline]
     pub fn new(page: &'a DataPage) -> Self {
         let (_, validity, _) = split_buffer(page);
 
-        let validity = convert(hybrid_rle::Decoder::new(validity, 1));
+        let iter = hybrid_rle::Decoder::new(validity, 1);
+        let iter = HybridDecoderBitmapIter::new(iter, page.num_values());
         Self {
-            validity,
-            run_offset: 0,
-            consumed: 0,
-            length: page.num_values(),
+            iter,
+            current: None,
         }
     }
 
-    #[inline]
     pub fn len(&self) -> usize {
-        self.length - self.consumed
+        self.iter.len()
+            + self
+                .current
+                .as_ref()
+                .map(|(run, offset)| run.len() - offset)
+                .unwrap_or_default()
+    }
+
+    fn next_limited(&mut self, limit: usize) -> Option<FilteredHybridEncoded<'a>> {
+        let (run, offset) = if let Some((run, offset)) = self.current {
+            (run, offset)
+        } else {
+            // a new run
+            let run = self.iter.next()?; // no run -> None
+            self.current = Some((run, 0));
+            return self.next_limited(limit);
+        };
+
+        match run {
+            HybridEncoded::Bitmap(values, length) => {
+                let run_length = length - offset;
+
+                let length = limit.min(run_length);
+
+                if length == run_length {
+                    self.current = None;
+                } else {
+                    self.current = Some((run, offset + length));
+                }
+
+                Some(FilteredHybridEncoded::Bitmap {
+                    values,
+                    offset,
+                    length,
+                })
+            }
+            HybridEncoded::Repeated(is_set, run_length) => {
+                let run_length = run_length - offset;
+
+                let length = limit.min(run_length);
+
+                if length == run_length {
+                    self.current = None;
+                } else {
+                    self.current = Some((run, offset + length));
+                }
+
+                Some(FilteredHybridEncoded::Repeated { is_set, length })
+            }
+        }
+    }
+}
+
+impl<'a> PageValidity<'a> for OptionalPageValidity<'a> {
+    fn next_limited(&mut self, limit: usize) -> Option<FilteredHybridEncoded<'a>> {
+        self.next_limited(limit)
+    }
 }
 
 /// Extends a [`Pushable`] from an iterator of non-null values and a hybrid-rle decoder
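+/// Values are taken from `values_iter` only for set validity bits; `limit`
+/// (when `Some`) caps the number of validity items consumed, so a single page
+/// can be decoded across multiple output chunks.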
 pub(super) fn extend_from_decoder<'a, T: Default, P: Pushable<T>, I: Iterator<Item = T>>(
     validity: &mut MutableBitmap,
-    page_validity: &mut OptionalPageValidity<'a>,
+    page_validity: &mut dyn PageValidity<'a>,
     limit: Option<usize>,
-    values: &mut P,
+    pushable: &mut P,
     mut values_iter: I,
 ) {
     let limit = limit.unwrap_or(usize::MAX);
@@ -155,69 +303,42 @@ pub(super) fn extend_from_decoder<'a, T: Default, P: Pushable<T>, I: Iterator<Item = T>>(
     let mut consumed_here = 0;
     while consumed_here < limit {
-        if let Some(run) = page_validity.next() {
-            match run {
-                &hybrid_rle::HybridEncoded::Bitpacked(pack) => {
-                    // a pack has at most `pack.len() * 8` bits
-                    // during execution, we may end in the middle of a pack (run_offset != 0)
-                    // the remaining items in the pack is dictated by a combination
-                    // of the page length, the offset in the pack, and where we are in the page
-                    let pack_size = pack.len() * 8 - page_validity.run_offset;
-                    let remaining = page_validity.length - page_validity.consumed;
-                    let length = std::cmp::min(pack_size, remaining);
-
-                    let additional = limit.min(length);
-
-                    // consume `additional` items
-                    let iter = BitmapIter::new(pack, page_validity.run_offset, additional);
-                    for is_valid in iter {
-                        if is_valid {
-                            values.push(values_iter.next().unwrap())
-                        } else {
-                            values.push_null()
-                        };
-                    }
-
-                    validity.extend_from_slice(pack, page_validity.run_offset, additional);
-
-                    if additional == length {
-                        page_validity.run_offset = 0
-                    } else {
-                        page_validity.run_offset += additional;
-                    };
-                    consumed_here += additional;
-                    page_validity.consumed += additional;
-                }
-                &hybrid_rle::HybridEncoded::Rle(value, length) => {
-                    let is_set = value[0] == 1;
-                    let length = length - page_validity.run_offset;
-
-                    // the number of elements that will be consumed in this (run, iteration)
-                    let additional = limit.min(length);
-
-                    validity.extend_constant(additional, is_set);
-                    if is_set {
-                        (0..additional).for_each(|_| values.push(values_iter.next().unwrap()));
-                    } else {
-                        values.extend_constant(additional, T::default());
-                    }
-
-                    if additional == length {
-                        page_validity.run_offset = 0
-                    } else {
-                        page_validity.run_offset += additional;
-                    };
-                    consumed_here += additional;
-                    page_validity.consumed += additional;
-                }
-            };
-        } else {
-            break;
-        }
+        let run = page_validity.next_limited(limit);
+        let run = if let Some(run) = run { run } else { break };
+
+        match run {
+            FilteredHybridEncoded::Bitmap {
+                values,
+                offset,
+                length,
+            } => {
+                // consume `length` items
+                let iter = BitmapIter::new(values, offset, length);
+                let iter = Zip::new(iter, &mut values_iter);
+
+                for item in iter {
+                    if let Some(item) = item {
+                        pushable.push(item)
+                    } else {
+                        pushable.push_null()
+                    }
+                }
+                validity.extend_from_slice(values, offset, length);
+
+                consumed_here += length;
+            }
+            FilteredHybridEncoded::Repeated { is_set, length } => {
+                validity.extend_constant(length, is_set);
+                if is_set {
+                    (0..length).for_each(|_| pushable.push(values_iter.next().unwrap()));
+                } else {
+                    pushable.extend_constant(length, T::default());
+                }
+
+                consumed_here += length;
+            }
+            FilteredHybridEncoded::Skipped(valids) => for _ in values_iter.by_ref().take(valids) {},
+        };
     }
 }
@@ -335,14 +456,13 @@ pub(super) fn next<'a, I: DataPages, D: Decoder<'a>>(
 }
 
 #[inline]
-pub(super) fn dict_indices_decoder(
-    indices_buffer: &[u8],
-    additional: usize,
-) -> hybrid_rle::HybridRleDecoder {
+pub(super) fn dict_indices_decoder(page: &DataPage) -> hybrid_rle::HybridRleDecoder {
+    let (_, _, indices_buffer) = split_buffer(page);
+
     // SPEC: Data page format: the bit width used to encode the entry ids stored as 1 byte (max bit width = 32),
     // SPEC: followed by the values encoded using RLE/Bit packed described above (with the given bit width).
     let bit_width = indices_buffer[0];
     let indices_buffer = &indices_buffer[1..];
 
-    hybrid_rle::HybridRleDecoder::new(indices_buffer, bit_width as u32, additional)
+    hybrid_rle::HybridRleDecoder::new(indices_buffer, bit_width as u32, page.num_values())
 }
diff --git a/src/io/parquet/read/indexes/binary.rs b/src/io/parquet/read/indexes/binary.rs
new file mode 100644
index 00000000000..f67e94d86c3
--- /dev/null
+++ b/src/io/parquet/read/indexes/binary.rs
@@ -0,0 +1,43 @@
+use parquet2::indexes::PageIndex;
+
+use crate::{
+    array::{Array, BinaryArray, PrimitiveArray, Utf8Array},
+    datatypes::{DataType, PhysicalType},
+    error::ArrowError,
+    trusted_len::TrustedLen,
+};
+
+use super::ColumnIndex;
+
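+/// Deserializes the `min`/`max` byte-array page statistics of a column into
+/// arrow arrays of `data_type`, validating utf-8 for the string variants.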
+pub fn deserialize(
+    indexes: &[PageIndex<Vec<u8>>],
+    data_type: &DataType,
+) -> Result<ColumnIndex, ArrowError> {
+    Ok(ColumnIndex {
+        min: deserialize_binary_iter(indexes.iter().map(|index| index.min.as_ref()), data_type)?,
+        max: deserialize_binary_iter(indexes.iter().map(|index| index.max.as_ref()), data_type)?,
+        null_count: PrimitiveArray::from_trusted_len_iter(
+            indexes
+                .iter()
+                .map(|index| index.null_count.map(|x| x as u64)),
+        ),
+    })
+}
+
+fn deserialize_binary_iter<'a, I: TrustedLen<Item = Option<&'a Vec<u8>>>>(
+    iter: I,
+    data_type: &DataType,
+) -> Result<Box<dyn Array>, ArrowError> {
+    match data_type.to_physical_type() {
+        PhysicalType::LargeBinary => Ok(Box::new(BinaryArray::<i64>::from_iter(iter))),
+        PhysicalType::Utf8 => {
+            let iter = iter.map(|x| x.map(|x| std::str::from_utf8(x)).transpose());
+            Ok(Box::new(Utf8Array::<i32>::try_from_trusted_len_iter(iter)?))
+        }
+        PhysicalType::LargeUtf8 => {
+            let iter = iter.map(|x| x.map(|x| std::str::from_utf8(x)).transpose());
+            Ok(Box::new(Utf8Array::<i64>::try_from_trusted_len_iter(iter)?))
+        }
+        _ => Ok(Box::new(BinaryArray::<i32>::from_iter(iter))),
+    }
+}
diff --git a/src/io/parquet/read/indexes/boolean.rs b/src/io/parquet/read/indexes/boolean.rs
new file mode 100644
index 00000000000..501c9e63a64
--- /dev/null
+++ b/src/io/parquet/read/indexes/boolean.rs
@@ -0,0 +1,21 @@
+use parquet2::indexes::PageIndex;
+
+use crate::array::{BooleanArray, PrimitiveArray};
+
+use super::ColumnIndex;
+
+pub fn deserialize(indexes: &[PageIndex<bool>]) -> ColumnIndex {
+    ColumnIndex {
+        min: Box::new(BooleanArray::from_trusted_len_iter(
+            indexes.iter().map(|index| index.min),
+        )),
+        max: Box::new(BooleanArray::from_trusted_len_iter(
+            indexes.iter().map(|index| index.max),
+        )),
+        null_count: PrimitiveArray::from_trusted_len_iter(
+            indexes
+                .iter()
+                .map(|index| index.null_count.map(|x| x as u64)),
+        ),
+    }
+}
diff --git a/src/io/parquet/read/indexes/fixed_len_binary.rs b/src/io/parquet/read/indexes/fixed_len_binary.rs
new file mode 100644
index 00000000000..c4499814d12
--- /dev/null
+++ b/src/io/parquet/read/indexes/fixed_len_binary.rs
@@ -0,0 +1,58 @@
+use parquet2::indexes::PageIndex;
+
+use crate::{
+    array::{Array, FixedSizeBinaryArray, MutableFixedSizeBinaryArray, PrimitiveArray},
+    datatypes::{DataType, PhysicalType, PrimitiveType},
+    trusted_len::TrustedLen,
+};
+
+use super::ColumnIndex;
+
+pub fn deserialize(indexes: &[PageIndex<Vec<u8>>], data_type: DataType) -> ColumnIndex {
+    ColumnIndex {
+        min: deserialize_binary_iter(
+            indexes.iter().map(|index| index.min.as_ref()),
+            data_type.clone(),
+        ),
+        max: deserialize_binary_iter(indexes.iter().map(|index| index.max.as_ref()), data_type),
+        null_count: PrimitiveArray::from_trusted_len_iter(
+            indexes
+                .iter()
+                .map(|index| index.null_count.map(|x| x as u64)),
+        ),
+    }
+}
+
+fn deserialize_binary_iter<'a, I: TrustedLen<Item = Option<&'a Vec<u8>>>>(
+    iter: I,
+    data_type: DataType,
+) -> Box<dyn Array> {
+    match data_type.to_physical_type() {
+        PhysicalType::Primitive(PrimitiveType::Int128) => {
+            Box::new(PrimitiveArray::from_trusted_len_iter(iter.map(|v| {
+                v.map(|x| {
+                    // Copy the fixed-size byte value to the start of a 16 byte stack
+                    // allocated buffer, then use an arithmetic right shift to fill in
+                    // MSBs, which accounts for leading 1's in negative (two's complement)
+                    // values.
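+                    // E.g. a 2-byte value [0xff, 0xfe] is copied into
+                    // [0xff, 0xfe, 0, ..., 0], read as 0xfffe << 112, and the
+                    // arithmetic shift by 8 * 14 bits sign-extends it to -2i128.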
+                    let n = x.len();
+                    let mut bytes = [0u8; 16];
+                    bytes[..n].copy_from_slice(x);
+                    i128::from_be_bytes(bytes) >> (8 * (16 - n))
+                })
+            })))
+        }
+        _ => {
+            let mut a = MutableFixedSizeBinaryArray::from_data(
+                data_type,
+                Vec::with_capacity(iter.size_hint().0),
+                None,
+            );
+            for item in iter {
+                a.push(item);
+            }
+            let a: FixedSizeBinaryArray = a.into();
+            Box::new(a)
+        }
+    }
+}
diff --git a/src/io/parquet/read/indexes/mod.rs b/src/io/parquet/read/indexes/mod.rs
new file mode 100644
index 00000000000..329fed1a3ff
--- /dev/null
+++ b/src/io/parquet/read/indexes/mod.rs
@@ -0,0 +1,141 @@
+use parquet2::indexes::{
+    BooleanIndex, ByteIndex, FixedLenByteIndex, Index as ParquetIndex, NativeIndex,
+};
+use parquet2::metadata::ColumnChunkMetaData;
+use parquet2::read::read_columns_indexes as _read_columns_indexes;
+use parquet2::schema::types::PhysicalType as ParquetPhysicalType;
+
+mod binary;
+mod boolean;
+mod fixed_len_binary;
+mod primitive;
+
+use std::io::{Read, Seek};
+
+use crate::datatypes::Field;
+use crate::{
+    array::{Array, UInt64Array},
+    datatypes::DataType,
+    error::ArrowError,
+};
+
+/// Arrow-deserialized [`ColumnIndex`] containing the minimum and maximum value
+/// of every page from the column.
+/// # Invariants
+/// The minimum and maximum are guaranteed to have the same logical type.
+#[derive(Debug, PartialEq)]
+pub struct ColumnIndex {
+    /// The minimum values in the pages
+    pub min: Box<dyn Array>,
+    /// The maximum values in the pages
+    pub max: Box<dyn Array>,
+    /// The number of null values in the pages
+    pub null_count: UInt64Array,
+}
+
+impl ColumnIndex {
+    /// The [`DataType`] of the column index.
+    pub fn data_type(&self) -> &DataType {
+        self.min.data_type()
+    }
+}
+
+/// Given a sequence of [`ParquetIndex`] representing the page indexes of each column in the
+/// parquet file, returns the page-level statistics as arrow arrays, as a vector of [`ColumnIndex`].
+///
+/// This function maps timestamps, decimal types, etc. accordingly.
+/// # Implementation
+/// This function is CPU-bounded and runs in `O(P)` where `P` is the total number of pages in all columns.
+/// # Error
+/// This function errors iff a value is not deserializable to arrow (e.g. invalid utf-8)
+fn deserialize(
+    indexes: &[Box<dyn ParquetIndex>],
+    data_types: Vec<DataType>,
+) -> Result<Vec<ColumnIndex>, ArrowError> {
+    indexes
+        .iter()
+        .zip(data_types.into_iter())
+        .map(|(index, data_type)| match index.physical_type() {
+            ParquetPhysicalType::Boolean => {
+                let index = index.as_any().downcast_ref::<BooleanIndex>().unwrap();
+                Ok(boolean::deserialize(&index.indexes))
+            }
+            ParquetPhysicalType::Int32 => {
+                let index = index.as_any().downcast_ref::<NativeIndex<i32>>().unwrap();
+                Ok(primitive::deserialize_i32(&index.indexes, data_type))
+            }
+            ParquetPhysicalType::Int64 => {
+                let index = index.as_any().downcast_ref::<NativeIndex<i64>>().unwrap();
+                Ok(primitive::deserialize_i64(
+                    &index.indexes,
+                    &index.primitive_type,
+                    data_type,
+                ))
+            }
+            ParquetPhysicalType::Int96 => {
+                let index = index
+                    .as_any()
+                    .downcast_ref::<NativeIndex<[u32; 3]>>()
+                    .unwrap();
+                Ok(primitive::deserialize_i96(&index.indexes, data_type))
+            }
+            ParquetPhysicalType::Float => {
+                let index = index.as_any().downcast_ref::<NativeIndex<f32>>().unwrap();
+                Ok(primitive::deserialize_id(&index.indexes, data_type))
+            }
+            ParquetPhysicalType::Double => {
+                let index = index.as_any().downcast_ref::<NativeIndex<f64>>().unwrap();
+                Ok(primitive::deserialize_id(&index.indexes, data_type))
+            }
+            ParquetPhysicalType::ByteArray => {
+                let index = index.as_any().downcast_ref::<ByteIndex>().unwrap();
+                binary::deserialize(&index.indexes, &data_type)
+            }
+            ParquetPhysicalType::FixedLenByteArray(_) => {
+                let index = index.as_any().downcast_ref::<FixedLenByteIndex>().unwrap();
+                Ok(fixed_len_binary::deserialize(&index.indexes, data_type))
+            }
+        })
+        .collect()
+}
+
+// recursive function to get the leaf data_types corresponding to the
+// parquet columns
+fn populate_dt(data_type: &DataType, container: &mut Vec<DataType>) {
+    match data_type.to_logical_type() {
+        DataType::List(inner) => populate_dt(&inner.data_type, container),
+        DataType::LargeList(inner) => populate_dt(&inner.data_type, container),
+        DataType::Dictionary(_, inner, _) => populate_dt(inner, container),
+        DataType::Struct(fields) => fields
+            .iter()
+            .for_each(|f| populate_dt(&f.data_type, container)),
+        _ => container.push(data_type.clone()),
+    }
+}
+
+/// Reads the column indexes from the reader assuming a valid set of derived Arrow fields
+/// for all the parquet columns in the file.
+///
+/// This function is expected to be used to filter out parquet pages.
+///
+/// # Implementation
+/// This function is IO-bounded and calls `reader.read_exact` exactly once.
+/// # Error
+/// Errors iff the indexes can't be read or their deserialization to arrow is incorrect (e.g.
+/// invalid utf-8)
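+/// # Example
+/// A sketch of reading the page-level statistics of the first row group of a
+/// file that was written with page indexes (the file name is illustrative):
+/// ```ignore
+/// let mut reader = std::fs::File::open("file.parquet")?;
+/// let metadata = read_metadata(&mut reader)?;
+/// let schema = infer_schema(&metadata)?;
+/// let indexes = read_columns_indexes(
+///     &mut reader,
+///     metadata.row_groups[0].columns(),
+///     &schema.fields,
+/// )?;
+/// ```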
+pub fn read_columns_indexes<R: Read + Seek>(
+    reader: &mut R,
+    chunks: &[ColumnChunkMetaData],
+    fields: &[Field],
+) -> Result<Vec<ColumnIndex>, ArrowError> {
+    let indexes = _read_columns_indexes(reader, chunks)?;
+
+    // map arrow fields to the corresponding columns in parquet taking into account
+    // that fields may be nested but parquet column indexes are only leaf columns
+    let mut data_types = vec![];
+    fields
+        .iter()
+        .map(|f| &f.data_type)
+        .for_each(|d| populate_dt(d, &mut data_types));
+
+    deserialize(&indexes, data_types)
+}
diff --git a/src/io/parquet/read/indexes/primitive.rs b/src/io/parquet/read/indexes/primitive.rs
new file mode 100644
index 00000000000..103d67bbcf1
--- /dev/null
+++ b/src/io/parquet/read/indexes/primitive.rs
@@ -0,0 +1,204 @@
+use parquet2::indexes::PageIndex;
+use parquet2::schema::types::{PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit};
+use parquet2::types::int96_to_i64_ns;
+
+use crate::array::{Array, MutablePrimitiveArray, PrimitiveArray};
+use crate::datatypes::{DataType, TimeUnit};
+use crate::trusted_len::TrustedLen;
+use crate::types::NativeType;
+
+use super::ColumnIndex;
+
+#[inline]
+fn deserialize_int32<I: TrustedLen<Item = Option<i32>>>(
+    iter: I,
+    data_type: DataType,
+) -> Box<dyn Array> {
+    use DataType::*;
+    match data_type.to_logical_type() {
+        UInt8 => Box::new(
+            PrimitiveArray::<u8>::from_trusted_len_iter(iter.map(|x| x.map(|x| x as u8)))
+                .to(data_type),
+        ) as _,
+        UInt16 => Box::new(
+            PrimitiveArray::<u16>::from_trusted_len_iter(iter.map(|x| x.map(|x| x as u16)))
+                .to(data_type),
+        ),
+        UInt32 => Box::new(
+            PrimitiveArray::<u32>::from_trusted_len_iter(iter.map(|x| x.map(|x| x as u32)))
+                .to(data_type),
+        ),
+        Decimal(_, _) => Box::new(
+            PrimitiveArray::<i128>::from_trusted_len_iter(iter.map(|x| x.map(|x| x as i128)))
+                .to(data_type),
+        ),
+        _ => Box::new(PrimitiveArray::<i32>::from_trusted_len_iter(iter).to(data_type)),
+    }
+}
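+/// Converts `array`'s values, in place, from the unit declared by the parquet
+/// `logical_type` into the arrow `time_unit`, e.g. microseconds to nanoseconds
+/// multiplies each value by 1_000.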
+#[inline]
+fn timestamp(
+    array: &mut MutablePrimitiveArray<i64>,
+    time_unit: TimeUnit,
+    logical_type: Option<PrimitiveLogicalType>,
+) {
+    let unit = if let Some(PrimitiveLogicalType::Timestamp { unit, .. }) = logical_type {
+        unit
+    } else {
+        return;
+    };
+
+    match (unit, time_unit) {
+        (ParquetTimeUnit::Milliseconds, TimeUnit::Second) => array
+            .values_mut_slice()
+            .iter_mut()
+            .for_each(|x| *x /= 1_000),
+        (ParquetTimeUnit::Microseconds, TimeUnit::Second) => array
+            .values_mut_slice()
+            .iter_mut()
+            .for_each(|x| *x /= 1_000_000),
+        (ParquetTimeUnit::Nanoseconds, TimeUnit::Second) => array
+            .values_mut_slice()
+            .iter_mut()
+            .for_each(|x| *x /= 1_000_000_000),
+
+        (ParquetTimeUnit::Milliseconds, TimeUnit::Millisecond) => {}
+        (ParquetTimeUnit::Microseconds, TimeUnit::Millisecond) => array
+            .values_mut_slice()
+            .iter_mut()
+            .for_each(|x| *x /= 1_000),
+        (ParquetTimeUnit::Nanoseconds, TimeUnit::Millisecond) => array
+            .values_mut_slice()
+            .iter_mut()
+            .for_each(|x| *x /= 1_000_000),
+
+        (ParquetTimeUnit::Milliseconds, TimeUnit::Microsecond) => array
+            .values_mut_slice()
+            .iter_mut()
+            .for_each(|x| *x *= 1_000),
+        (ParquetTimeUnit::Microseconds, TimeUnit::Microsecond) => {}
+        (ParquetTimeUnit::Nanoseconds, TimeUnit::Microsecond) => array
+            .values_mut_slice()
+            .iter_mut()
+            .for_each(|x| *x /= 1_000),
+
+        (ParquetTimeUnit::Milliseconds, TimeUnit::Nanosecond) => array
+            .values_mut_slice()
+            .iter_mut()
+            .for_each(|x| *x *= 1_000_000),
+        (ParquetTimeUnit::Microseconds, TimeUnit::Nanosecond) => array
+            .values_mut_slice()
+            .iter_mut()
+            // microseconds to nanoseconds requires multiplying, not dividing
+            .for_each(|x| *x *= 1_000),
+        (ParquetTimeUnit::Nanoseconds, TimeUnit::Nanosecond) => {}
+    }
+}
+
+#[inline]
+fn deserialize_int64<I: TrustedLen<Item = Option<i64>>>(
+    iter: I,
+    primitive_type: &PrimitiveType,
+    data_type: DataType,
+) -> Box<dyn Array> {
+    use DataType::*;
+    match data_type.to_logical_type() {
+        UInt64 => Box::new(
+            PrimitiveArray::<u64>::from_trusted_len_iter(iter.map(|x| x.map(|x| x as u64)))
+                .to(data_type),
+        ) as _,
+        Decimal(_, _) => Box::new(
+            PrimitiveArray::<i128>::from_trusted_len_iter(iter.map(|x| x.map(|x| x as i128)))
+                .to(data_type),
+        ) as _,
+        Timestamp(time_unit, _) => {
+            let mut array =
+                MutablePrimitiveArray::<i64>::from_trusted_len_iter(iter).to(data_type.clone());
+
+            timestamp(&mut array, *time_unit, primitive_type.logical_type);
+
+            let array: PrimitiveArray<i64> = array.into();
+
+            Box::new(array)
+        }
+        _ => Box::new(PrimitiveArray::<i64>::from_trusted_len_iter(iter).to(data_type)),
+    }
+}
+
+#[inline]
+fn deserialize_int96<I: TrustedLen<Item = Option<[u32; 3]>>>(
+    iter: I,
+    data_type: DataType,
+) -> Box<dyn Array> {
+    Box::new(
+        PrimitiveArray::<i64>::from_trusted_len_iter(iter.map(|x| x.map(int96_to_i64_ns)))
+            .to(data_type),
+    )
+}
+
+#[inline]
+fn deserialize_id_s<T: NativeType, I: TrustedLen<Item = Option<T>>>(
+    iter: I,
+    data_type: DataType,
+) -> Box<dyn Array> {
+    Box::new(PrimitiveArray::<T>::from_trusted_len_iter(iter).to(data_type))
+}
+
+pub fn deserialize_i32(indexes: &[PageIndex<i32>], data_type: DataType) -> ColumnIndex {
+    ColumnIndex {
+        min: deserialize_int32(indexes.iter().map(|index| index.min), data_type.clone()),
+        max: deserialize_int32(indexes.iter().map(|index| index.max), data_type),
+        null_count: PrimitiveArray::from_trusted_len_iter(
+            indexes
+                .iter()
+                .map(|index| index.null_count.map(|x| x as u64)),
+        ),
+    }
+}
+
+pub fn deserialize_i64(
+    indexes: &[PageIndex<i64>],
+    primitive_type: &PrimitiveType,
+    data_type: DataType,
+) -> ColumnIndex {
+    ColumnIndex {
+        min: deserialize_int64(
+            indexes.iter().map(|index| index.min),
+            primitive_type,
+            data_type.clone(),
+        ),
+        max: deserialize_int64(
+            indexes.iter().map(|index| index.max),
+            primitive_type,
+            data_type,
+        ),
+        null_count: PrimitiveArray::from_trusted_len_iter(
+            indexes
+                .iter()
+                .map(|index| index.null_count.map(|x| x as u64)),
+        ),
+    }
+}
+
+pub fn deserialize_i96(indexes: 
&[PageIndex<[u32; 3]>], data_type: DataType) -> ColumnIndex { + ColumnIndex { + min: deserialize_int96(indexes.iter().map(|index| index.min), data_type.clone()), + max: deserialize_int96(indexes.iter().map(|index| index.max), data_type), + null_count: PrimitiveArray::from_trusted_len_iter( + indexes + .iter() + .map(|index| index.null_count.map(|x| x as u64)), + ), + } +} + +pub fn deserialize_id(indexes: &[PageIndex], data_type: DataType) -> ColumnIndex { + ColumnIndex { + min: deserialize_id_s(indexes.iter().map(|index| index.min), data_type.clone()), + max: deserialize_id_s(indexes.iter().map(|index| index.max), data_type), + null_count: PrimitiveArray::from_trusted_len_iter( + indexes + .iter() + .map(|index| index.null_count.map(|x| x as u64)), + ), + } +} diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs index 14bcbef3c02..dfac4c38df4 100644 --- a/src/io/parquet/read/mod.rs +++ b/src/io/parquet/read/mod.rs @@ -3,45 +3,47 @@ mod deserialize; mod file; +mod indexes; mod row_group; pub mod schema; pub mod statistics; +use std::{ + io::{Read, Seek}, + sync::Arc, +}; + use futures::{AsyncRead, AsyncSeek}; // re-exports of parquet2's relevant APIs pub use parquet2::{ - error::ParquetError, + error::Error as ParquetError, fallible_streaming_iterator, metadata::{ColumnChunkMetaData, ColumnDescriptor, RowGroupMetaData}, page::{CompressedDataPage, DataPage, DataPageHeader}, read::{ decompress, get_column_iterator, get_page_iterator as _get_page_iterator, - get_page_stream as _get_page_stream, read_metadata as _read_metadata, - read_metadata_async as _read_metadata_async, BasicDecompressor, ColumnChunkIter, - Decompressor, MutStreamingIterator, PageFilter, PageIterator, ReadColumnIterator, State, + get_page_stream as _get_page_stream, read_columns_indexes as _read_columns_indexes, + read_metadata as _read_metadata, read_metadata_async as _read_metadata_async, + read_pages_locations, BasicDecompressor, ColumnChunkIter, Decompressor, + MutStreamingIterator, PageFilter, PageReader, ReadColumnIterator, State, }, schema::types::{ - LogicalType, ParquetType, PhysicalType, PrimitiveConvertedType, - TimeUnit as ParquetTimeUnit, TimestampType, + GroupLogicalType, ParquetType, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType, + TimeUnit as ParquetTimeUnit, }, types::int96_to_i64_ns, FallibleStreamingIterator, }; +use crate::{array::Array, error::Result}; + pub use deserialize::{column_iter_to_arrays, get_page_iterator}; pub use file::{FileReader, RowGroupReader}; +pub use indexes::{read_columns_indexes, ColumnIndex}; pub use row_group::*; -pub(crate) use schema::is_type_nullable; pub use schema::{infer_schema, FileMetaData}; -use std::{ - io::{Read, Seek}, - sync::Arc, -}; - -use crate::{array::Array, error::Result}; - /// Trait describing a [`FallibleStreamingIterator`] of [`DataPage`] pub trait DataPages: FallibleStreamingIterator + Send + Sync diff --git a/src/io/parquet/read/row_group.rs b/src/io/parquet/read/row_group.rs index f83a65eadbe..b53e60f271f 100644 --- a/src/io/parquet/read/row_group.rs +++ b/src/io/parquet/read/row_group.rs @@ -9,7 +9,7 @@ use futures::{ }; use parquet2::{ metadata::ColumnChunkMetaData, - read::{BasicDecompressor, PageIterator}, + read::{BasicDecompressor, PageReader}, }; use crate::{ @@ -95,7 +95,7 @@ pub(super) fn get_field_columns<'a>( ) -> Vec<&'a ColumnChunkMetaData> { columns .iter() - .filter(|x| x.descriptor().path_in_schema()[0] == field_name) + .filter(|x| x.descriptor().path_in_schema[0] == field_name) .collect() } @@ -181,17 
+181,15 @@ pub fn to_deserializer<'a>( let (columns, types): (Vec<_>, Vec<_>) = columns .into_iter() .map(|(column_meta, chunk)| { - let pages = PageIterator::new( + let pages = PageReader::new( std::io::Cursor::new(chunk), - column_meta.num_values(), - column_meta.compression(), - column_meta.descriptor().clone(), + column_meta, Arc::new(|_, _| true), vec![], ); ( BasicDecompressor::new(pages, vec![]), - column_meta.descriptor().type_(), + &column_meta.descriptor().descriptor.primitive_type, ) }) .unzip(); diff --git a/src/io/parquet/read/schema/convert.rs b/src/io/parquet/read/schema/convert.rs index ae2d66a1b9b..9e32ee30572 100644 --- a/src/io/parquet/read/schema/convert.rs +++ b/src/io/parquet/read/schema/convert.rs @@ -1,8 +1,8 @@ //! This module has a single entry point, [`parquet_to_arrow_schema`]. use parquet2::schema::{ types::{ - BasicTypeInfo, GroupConvertedType, LogicalType, ParquetType, PhysicalType, - PrimitiveConvertedType, TimeUnit as ParquetTimeUnit, TimestampType, + FieldInfo, GroupConvertedType, GroupLogicalType, IntegerType, ParquetType, PhysicalType, + PrimitiveConvertedType, PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit, }, Repetition, }; @@ -16,28 +16,27 @@ pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> Vec { } fn from_int32( - logical_type: &Option, - converted_type: &Option, + logical_type: Option, + converted_type: Option, ) -> DataType { + use PrimitiveLogicalType::*; match (logical_type, converted_type) { // handle logical types first - (Some(LogicalType::INTEGER(t)), _) => match (t.bit_width, t.is_signed) { - (8, true) => DataType::Int8, - (16, true) => DataType::Int16, - (32, true) => DataType::Int32, - (8, false) => DataType::UInt8, - (16, false) => DataType::UInt16, - (32, false) => DataType::UInt32, + (Some(Integer(t)), _) => match t { + IntegerType::Int8 => DataType::Int8, + IntegerType::Int16 => DataType::Int16, + IntegerType::Int32 => DataType::Int32, + IntegerType::UInt8 => DataType::UInt8, + IntegerType::UInt16 => DataType::UInt16, + IntegerType::UInt32 => DataType::UInt32, // The above are the only possible annotations for parquet's int32. Anything else // is a deviation to the parquet specification and we ignore _ => DataType::Int32, }, - (Some(LogicalType::DECIMAL(t)), _) => { - DataType::Decimal(t.precision as usize, t.scale as usize) - } - (Some(LogicalType::DATE(_)), _) => DataType::Date32, - (Some(LogicalType::TIME(t)), _) => match t.unit { - ParquetTimeUnit::MILLIS(_) => DataType::Time32(TimeUnit::Millisecond), + (Some(Decimal(precision, scale)), _) => DataType::Decimal(precision, scale), + (Some(Date), _) => DataType::Date32, + (Some(Time { unit, .. }), _) => match unit { + ParquetTimeUnit::Milliseconds => DataType::Time32(TimeUnit::Millisecond), // MILLIS is the only possible annotation for parquet's int32. 
Anything else // is a deviation to the parquet specification and we ignore _ => DataType::Int32, @@ -52,30 +51,32 @@ fn from_int32( (_, Some(PrimitiveConvertedType::Date)) => DataType::Date32, (_, Some(PrimitiveConvertedType::TimeMillis)) => DataType::Time32(TimeUnit::Millisecond), (_, Some(PrimitiveConvertedType::Decimal(precision, scale))) => { - DataType::Decimal(*precision as usize, *scale as usize) + DataType::Decimal(precision, scale) } (_, _) => DataType::Int32, } } fn from_int64( - logical_type: &Option, - converted_type: &Option, + logical_type: Option, + converted_type: Option, ) -> DataType { + use PrimitiveLogicalType::*; match (logical_type, converted_type) { // handle logical types first - (Some(LogicalType::INTEGER(t)), _) if t.bit_width == 64 => match t.is_signed { - true => DataType::Int64, - false => DataType::UInt64, + (Some(Integer(integer)), _) => match integer { + IntegerType::UInt64 => DataType::UInt64, + IntegerType::Int64 => DataType::Int64, + _ => DataType::Int64, }, ( - Some(LogicalType::TIMESTAMP(TimestampType { - is_adjusted_to_u_t_c, + Some(Timestamp { + is_adjusted_to_utc, unit, - })), + }), _, ) => { - let timezone = if *is_adjusted_to_u_t_c { + let timezone = if is_adjusted_to_utc { // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md // A TIMESTAMP with isAdjustedToUTC=true is defined as [...] elapsed since the Unix epoch Some("+00:00".to_string()) @@ -93,21 +94,23 @@ fn from_int64( }; match unit { - ParquetTimeUnit::MILLIS(_) => DataType::Timestamp(TimeUnit::Millisecond, timezone), - ParquetTimeUnit::MICROS(_) => DataType::Timestamp(TimeUnit::Microsecond, timezone), - ParquetTimeUnit::NANOS(_) => DataType::Timestamp(TimeUnit::Nanosecond, timezone), + ParquetTimeUnit::Milliseconds => { + DataType::Timestamp(TimeUnit::Millisecond, timezone) + } + ParquetTimeUnit::Microseconds => { + DataType::Timestamp(TimeUnit::Microsecond, timezone) + } + ParquetTimeUnit::Nanoseconds => DataType::Timestamp(TimeUnit::Nanosecond, timezone), } } - (Some(LogicalType::TIME(t)), _) => match t.unit { - ParquetTimeUnit::MICROS(_) => DataType::Time64(TimeUnit::Microsecond), - ParquetTimeUnit::NANOS(_) => DataType::Time64(TimeUnit::Nanosecond), + (Some(Time { unit, .. }), _) => match unit { + ParquetTimeUnit::Microseconds => DataType::Time64(TimeUnit::Microsecond), + ParquetTimeUnit::Nanoseconds => DataType::Time64(TimeUnit::Nanosecond), // MILLIS is only possible for int32. 
Appearing in int64 is a deviation // to parquet's spec, which we ignore _ => DataType::Int64, }, - (Some(LogicalType::DECIMAL(t)), _) => { - DataType::Decimal(t.precision as usize, t.scale as usize) - } + (Some(Decimal(precision, scale)), _) => DataType::Decimal(precision, scale), // handle converted types: (_, Some(PrimitiveConvertedType::TimeMicros)) => DataType::Time64(TimeUnit::Microsecond), (_, Some(PrimitiveConvertedType::TimestampMillis)) => { @@ -119,7 +122,7 @@ fn from_int64( (_, Some(PrimitiveConvertedType::Int64)) => DataType::Int64, (_, Some(PrimitiveConvertedType::Uint64)) => DataType::UInt64, (_, Some(PrimitiveConvertedType::Decimal(precision, scale))) => { - DataType::Decimal(*precision as usize, *scale as usize) + DataType::Decimal(precision, scale) } (_, _) => DataType::Int64, @@ -127,14 +130,14 @@ fn from_int64( } fn from_byte_array( - logical_type: &Option, + logical_type: &Option, converted_type: &Option, ) -> DataType { match (logical_type, converted_type) { - (Some(LogicalType::STRING(_)), _) => DataType::Utf8, - (Some(LogicalType::JSON(_)), _) => DataType::Binary, - (Some(LogicalType::BSON(_)), _) => DataType::Binary, - (Some(LogicalType::ENUM(_)), _) => DataType::Binary, + (Some(PrimitiveLogicalType::String), _) => DataType::Utf8, + (Some(PrimitiveLogicalType::Json), _) => DataType::Binary, + (Some(PrimitiveLogicalType::Bson), _) => DataType::Binary, + (Some(PrimitiveLogicalType::Enum), _) => DataType::Binary, (_, Some(PrimitiveConvertedType::Json)) => DataType::Binary, (_, Some(PrimitiveConvertedType::Bson)) => DataType::Binary, (_, Some(PrimitiveConvertedType::Enum)) => DataType::Binary, @@ -144,16 +147,16 @@ fn from_byte_array( } fn from_fixed_len_byte_array( - length: &i32, - logical_type: &Option, - converted_type: &Option, + length: usize, + logical_type: Option, + converted_type: Option, ) -> DataType { match (logical_type, converted_type) { - (Some(LogicalType::DECIMAL(t)), _) => { - DataType::Decimal(t.precision as usize, t.scale as usize) + (Some(PrimitiveLogicalType::Decimal(precision, scale)), _) => { + DataType::Decimal(precision, scale) } (None, Some(PrimitiveConvertedType::Decimal(precision, scale))) => { - DataType::Decimal(*precision as usize, *scale as usize) + DataType::Decimal(precision, scale) } (None, Some(PrimitiveConvertedType::Interval)) => { // There is currently no reliable way of determining which IntervalUnit @@ -161,46 +164,45 @@ fn from_fixed_len_byte_array( // would be incorrect if all 12 bytes of the interval are populated DataType::Interval(IntervalUnit::DayTime) } - _ => DataType::FixedSizeBinary(*length as usize), + _ => DataType::FixedSizeBinary(length), } } /// Maps a [`PhysicalType`] with optional metadata to a [`DataType`] -fn to_primitive_type_inner( - physical_type: &PhysicalType, - logical_type: &Option, - converted_type: &Option, -) -> DataType { - match physical_type { +fn to_primitive_type_inner(primitive_type: &PrimitiveType) -> DataType { + match primitive_type.physical_type { PhysicalType::Boolean => DataType::Boolean, - PhysicalType::Int32 => from_int32(logical_type, converted_type), - PhysicalType::Int64 => from_int64(logical_type, converted_type), + PhysicalType::Int32 => { + from_int32(primitive_type.logical_type, primitive_type.converted_type) + } + PhysicalType::Int64 => { + from_int64(primitive_type.logical_type, primitive_type.converted_type) + } PhysicalType::Int96 => DataType::Timestamp(TimeUnit::Nanosecond, None), PhysicalType::Float => DataType::Float32, PhysicalType::Double => DataType::Float64, - 
PhysicalType::ByteArray => from_byte_array(logical_type, converted_type), - PhysicalType::FixedLenByteArray(length) => { - from_fixed_len_byte_array(length, logical_type, converted_type) + PhysicalType::ByteArray => { + from_byte_array(&primitive_type.logical_type, &primitive_type.converted_type) } + PhysicalType::FixedLenByteArray(length) => from_fixed_len_byte_array( + length, + primitive_type.logical_type, + primitive_type.converted_type, + ), } } /// Entry point for converting parquet primitive type to arrow type. /// /// This function takes care of repetition. -fn to_primitive_type( - basic_info: &BasicTypeInfo, - physical_type: &PhysicalType, - logical_type: &Option, - converted_type: &Option, -) -> DataType { - let base_type = to_primitive_type_inner(physical_type, logical_type, converted_type); +fn to_primitive_type(primitive_type: &PrimitiveType) -> DataType { + let base_type = to_primitive_type_inner(primitive_type); - if basic_info.repetition() == &Repetition::Repeated { + if primitive_type.field_info.repetition == Repetition::Repeated { DataType::List(Box::new(Field::new( - basic_info.name(), + &primitive_type.field_info.name, base_type, - is_nullable(basic_info), + is_nullable(&primitive_type.field_info), ))) } else { base_type @@ -208,14 +210,14 @@ fn to_primitive_type( } fn non_repeated_group( - logical_type: &Option, + logical_type: &Option, converted_type: &Option, fields: &[ParquetType], parent_name: &str, ) -> Option { debug_assert!(!fields.is_empty()); match (logical_type, converted_type) { - (Some(LogicalType::LIST(_)), _) => to_list(fields, parent_name), + (Some(GroupLogicalType::List), _) => to_list(fields, parent_name), (None, Some(GroupConvertedType::List)) => to_list(fields, parent_name), _ => to_struct(fields), } @@ -236,18 +238,18 @@ fn to_struct(fields: &[ParquetType]) -> Option { /// /// This function takes care of logical type and repetition. fn to_group_type( - basic_info: &BasicTypeInfo, - logical_type: &Option, + field_info: &FieldInfo, + logical_type: &Option, converted_type: &Option, fields: &[ParquetType], parent_name: &str, ) -> Option { debug_assert!(!fields.is_empty()); - if basic_info.repetition() == &Repetition::Repeated { + if field_info.repetition == Repetition::Repeated { Some(DataType::List(Box::new(Field::new( - basic_info.name(), + &field_info.name, to_struct(fields)?, - is_nullable(basic_info), + is_nullable(field_info), )))) } else { non_repeated_group(logical_type, converted_type, fields, parent_name) @@ -255,8 +257,8 @@ fn to_group_type( } /// Checks whether this schema is nullable. -pub(crate) fn is_nullable(basic_info: &BasicTypeInfo) -> bool { - match basic_info.repetition() { +pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool { + match field_info.repetition { Repetition::Optional => true, Repetition::Repeated => true, Repetition::Required => false, @@ -268,9 +270,9 @@ pub(crate) fn is_nullable(basic_info: &BasicTypeInfo) -> bool { /// i.e. if it is a column-less group type. fn to_field(type_: &ParquetType) -> Option { Some(Field::new( - type_.get_basic_info().name(), + &type_.get_field_info().name, to_data_type(type_)?, - is_nullable(type_.get_basic_info()), + is_nullable(type_.get_field_info()), )) } @@ -282,16 +284,7 @@ fn to_list(fields: &[ParquetType], parent_name: &str) -> Option { let item = fields.first().unwrap(); let item_type = match item { - ParquetType::PrimitiveType { - physical_type, - logical_type, - converted_type, - .. 
- } => Some(to_primitive_type_inner( - physical_type, - logical_type, - converted_type, - )), + ParquetType::PrimitiveType(primitive) => Some(to_primitive_type_inner(primitive)), ParquetType::GroupType { fields, .. } => { if fields.len() == 1 && item.name() != "array" @@ -312,17 +305,17 @@ fn to_list(fields: &[ParquetType], parent_name: &str) -> Option { // Without this step, the child incorrectly inherits the parent's optionality let (list_item_name, item_is_optional) = match item { ParquetType::GroupType { - basic_info, fields, .. - } if basic_info.name() == "list" && fields.len() == 1 => { + field_info, fields, .. + } if field_info.name == "list" && fields.len() == 1 => { let field = fields.first().unwrap(); ( - field.name(), - field.get_basic_info().repetition() != &Repetition::Required, + &field.get_field_info().name, + field.get_field_info().repetition != Repetition::Required, ) } _ => ( - item.name(), - item.get_basic_info().repetition() != &Repetition::Required, + &item.get_field_info().name, + item.get_field_info().repetition != Repetition::Required, ), }; @@ -344,19 +337,9 @@ fn to_list(fields: &[ParquetType], parent_name: &str) -> Option { /// conversion, the result is Ok(None). pub(crate) fn to_data_type(type_: &ParquetType) -> Option { match type_ { - ParquetType::PrimitiveType { - basic_info, - physical_type, - logical_type, - converted_type, - } => Some(to_primitive_type( - basic_info, - physical_type, - logical_type, - converted_type, - )), + ParquetType::PrimitiveType(primitive) => Some(to_primitive_type(primitive)), ParquetType::GroupType { - basic_info, + field_info, logical_type, converted_type, fields, @@ -365,11 +348,11 @@ pub(crate) fn to_data_type(type_: &ParquetType) -> Option { None } else { to_group_type( - basic_info, + field_info, logical_type, converted_type, fields, - basic_info.name(), + &field_info.name, ) } } diff --git a/src/io/parquet/read/schema/mod.rs b/src/io/parquet/read/schema/mod.rs index 0c7e7d4d665..17147fb03b5 100644 --- a/src/io/parquet/read/schema/mod.rs +++ b/src/io/parquet/read/schema/mod.rs @@ -28,7 +28,3 @@ pub fn infer_schema(file_metadata: &FileMetaData) -> Result { Schema { fields, metadata } })) } - -pub(crate) fn is_type_nullable(type_: &ParquetType) -> bool { - is_nullable(type_.get_basic_info()) -} diff --git a/src/io/parquet/read/statistics/primitive.rs b/src/io/parquet/read/statistics/primitive.rs index 91a630692df..9ef9ec7d44e 100644 --- a/src/io/parquet/read/statistics/primitive.rs +++ b/src/io/parquet/read/statistics/primitive.rs @@ -1,14 +1,14 @@ -use crate::datatypes::TimeUnit; -use crate::{datatypes::DataType, types::NativeType}; -use parquet2::schema::types::{ - LogicalType, ParquetType, TimeUnit as ParquetTimeUnit, TimestampType, -}; +use std::any::Any; + +use parquet2::schema::types::{PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit}; use parquet2::statistics::PrimitiveStatistics as ParquetPrimitiveStatistics; use parquet2::types::NativeType as ParquetNativeType; -use std::any::Any; -use super::Statistics; +use crate::datatypes::TimeUnit; use crate::error::Result; +use crate::{datatypes::DataType, types::NativeType}; + +use super::Statistics; /// Arrow-deserialized parquet Statistics of a primitive type #[derive(Debug, Clone, PartialEq)] @@ -74,35 +74,29 @@ pub(super) fn statistics_from_i32( }) } -fn timestamp(type_: &ParquetType, time_unit: TimeUnit, x: i64) -> i64 { - let logical_type = if let ParquetType::PrimitiveType { logical_type, .. 
} = type_ {
-        logical_type
-    } else {
-        unreachable!()
-    };
-
-    let unit = if let Some(LogicalType::TIMESTAMP(TimestampType { unit, .. })) = logical_type {
+fn timestamp(type_: &PrimitiveType, time_unit: TimeUnit, x: i64) -> i64 {
+    let unit = if let Some(PrimitiveLogicalType::Timestamp { unit, .. }) = &type_.logical_type {
         unit
     } else {
         return x;
     };
 
     match (unit, time_unit) {
-        (ParquetTimeUnit::MILLIS(_), TimeUnit::Second) => x / 1_000,
-        (ParquetTimeUnit::MICROS(_), TimeUnit::Second) => x / 1_000_000,
-        (ParquetTimeUnit::NANOS(_), TimeUnit::Second) => x * 1_000_000_000,
+        (ParquetTimeUnit::Milliseconds, TimeUnit::Second) => x / 1_000,
+        (ParquetTimeUnit::Microseconds, TimeUnit::Second) => x / 1_000_000,
+        (ParquetTimeUnit::Nanoseconds, TimeUnit::Second) => x / 1_000_000_000,
 
-        (ParquetTimeUnit::MILLIS(_), TimeUnit::Millisecond) => x,
-        (ParquetTimeUnit::MICROS(_), TimeUnit::Millisecond) => x / 1_000,
-        (ParquetTimeUnit::NANOS(_), TimeUnit::Millisecond) => x / 1_000_000,
+        (ParquetTimeUnit::Milliseconds, TimeUnit::Millisecond) => x,
+        (ParquetTimeUnit::Microseconds, TimeUnit::Millisecond) => x / 1_000,
+        (ParquetTimeUnit::Nanoseconds, TimeUnit::Millisecond) => x / 1_000_000,
 
-        (ParquetTimeUnit::MILLIS(_), TimeUnit::Microsecond) => x * 1_000,
-        (ParquetTimeUnit::MICROS(_), TimeUnit::Microsecond) => x,
-        (ParquetTimeUnit::NANOS(_), TimeUnit::Microsecond) => x / 1_000,
+        (ParquetTimeUnit::Milliseconds, TimeUnit::Microsecond) => x * 1_000,
+        (ParquetTimeUnit::Microseconds, TimeUnit::Microsecond) => x,
+        (ParquetTimeUnit::Nanoseconds, TimeUnit::Microsecond) => x / 1_000,
 
-        (ParquetTimeUnit::MILLIS(_), TimeUnit::Nanosecond) => x * 1_000_000,
-        (ParquetTimeUnit::MICROS(_), TimeUnit::Nanosecond) => x * 1_000,
-        (ParquetTimeUnit::NANOS(_), TimeUnit::Nanosecond) => x,
+        (ParquetTimeUnit::Milliseconds, TimeUnit::Nanosecond) => x * 1_000_000,
+        (ParquetTimeUnit::Microseconds, TimeUnit::Nanosecond) => x * 1_000,
+        (ParquetTimeUnit::Nanoseconds, TimeUnit::Nanosecond) => x,
     }
 }
 
@@ -121,10 +115,10 @@ pub(super) fn statistics_from_i64(
         distinct_count: stats.distinct_count,
         min_value: stats
             .min_value
-            .map(|x| timestamp(stats.descriptor.type_(), time_unit, x)),
+            .map(|x| timestamp(&stats.primitive_type, time_unit, x)),
         max_value: stats
             .max_value
-            .map(|x| timestamp(stats.descriptor.type_(), time_unit, x)),
+            .map(|x| timestamp(&stats.primitive_type, time_unit, x)),
         }),
         Decimal(_, _) => Box::new(PrimitiveStatistics::<i128>::from((stats, data_type))),
         _ => Box::new(PrimitiveStatistics::<i64>::from((stats, data_type))),
diff --git a/src/io/parquet/write/binary/basic.rs b/src/io/parquet/write/binary/basic.rs
index 7a7c4cd805e..277a5c192a4 100644
--- a/src/io/parquet/write/binary/basic.rs
+++ b/src/io/parquet/write/binary/basic.rs
@@ -1,17 +1,18 @@
 use parquet2::{
     encoding::{delta_bitpacked, Encoding},
-    metadata::ColumnDescriptor,
+    metadata::Descriptor,
     page::DataPage,
+    schema::types::PrimitiveType,
     statistics::{serialize_statistics, BinaryStatistics, ParquetStatistics, Statistics},
-    write::WriteOptions,
 };
 
 use super::super::utils;
+use super::super::WriteOptions;
 use crate::{
     array::{Array, BinaryArray, Offset},
     bitmap::Bitmap,
     error::{ArrowError, Result},
-    io::parquet::read::is_type_nullable,
+    io::parquet::read::schema::is_nullable,
 };
 
 pub(crate) fn encode_plain<O: Offset>(
@@ -42,11 +43,11 @@ pub(crate) fn encode_plain<O: Offset>(
 pub fn array_to_page<O: Offset>(
     array: &BinaryArray<O>,
     options: WriteOptions,
-    descriptor: ColumnDescriptor,
+    descriptor: Descriptor,
     encoding: Encoding,
 ) -> Result<DataPage> {
     let validity = array.validity();
-    let 
is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let mut buffer = vec![]; utils::write_def_levels( @@ -78,7 +79,7 @@ pub fn array_to_page( } let statistics = if options.write_statistics { - Some(build_statistics(array, descriptor.clone())) + Some(build_statistics(array, descriptor.primitive_type.clone())) } else { None }; @@ -86,6 +87,7 @@ pub fn array_to_page( utils::build_plain_page( buffer, array.len(), + array.len(), array.null_count(), 0, definition_levels_byte_length, @@ -96,12 +98,12 @@ pub fn array_to_page( ) } -pub(super) fn build_statistics( +pub(crate) fn build_statistics( array: &BinaryArray, - descriptor: ColumnDescriptor, + primitive_type: PrimitiveType, ) -> ParquetStatistics { let statistics = &BinaryStatistics { - descriptor, + primitive_type, null_count: Some(array.null_count() as i64), distinct_count: None, max_value: array diff --git a/src/io/parquet/write/binary/mod.rs b/src/io/parquet/write/binary/mod.rs index 8d9e94cd0fb..e229572b14a 100644 --- a/src/io/parquet/write/binary/mod.rs +++ b/src/io/parquet/write/binary/mod.rs @@ -2,6 +2,7 @@ mod basic; mod nested; pub use basic::array_to_page; +pub(crate) use basic::build_statistics; pub(crate) use basic::encode_plain; pub(super) use basic::{encode_delta, ord_binary}; pub use nested::array_to_page as nested_array_to_page; diff --git a/src/io/parquet/write/binary/nested.rs b/src/io/parquet/write/binary/nested.rs index 9161741c4cb..a6e65e2f7e4 100644 --- a/src/io/parquet/write/binary/nested.rs +++ b/src/io/parquet/write/binary/nested.rs @@ -1,26 +1,25 @@ -use parquet2::{ - encoding::Encoding, metadata::ColumnDescriptor, page::DataPage, write::WriteOptions, -}; +use parquet2::metadata::Descriptor; +use parquet2::{encoding::Encoding, page::DataPage}; -use super::super::{levels, utils}; +use super::super::{levels, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; +use crate::io::parquet::read::schema::is_nullable; use crate::{ array::{Array, BinaryArray, Offset}, error::Result, - io::parquet::read::is_type_nullable, }; pub fn array_to_page( array: &BinaryArray, options: WriteOptions, - descriptor: ColumnDescriptor, + descriptor: Descriptor, nested: levels::NestedInfo, ) -> Result where OO: Offset, O: Offset, { - let is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let validity = array.validity(); @@ -34,7 +33,7 @@ where encode_plain(array, is_optional, &mut buffer); let statistics = if options.write_statistics { - Some(build_statistics(array, descriptor.clone())) + Some(build_statistics(array, descriptor.primitive_type.clone())) } else { None }; @@ -42,6 +41,7 @@ where utils::build_plain_page( buffer, levels::num_values(nested.offsets()), + nested.offsets().len().saturating_sub(1), array.null_count(), repetition_levels_byte_length, definition_levels_byte_length, diff --git a/src/io/parquet/write/boolean/basic.rs b/src/io/parquet/write/boolean/basic.rs index f9046d6d585..643a25cd5b2 100644 --- a/src/io/parquet/write/boolean/basic.rs +++ b/src/io/parquet/write/boolean/basic.rs @@ -1,14 +1,14 @@ use parquet2::{ encoding::{hybrid_rle::bitpacked_encode, Encoding}, - metadata::ColumnDescriptor, + metadata::Descriptor, page::DataPage, statistics::{serialize_statistics, BooleanStatistics, ParquetStatistics, Statistics}, - write::WriteOptions, }; use super::super::utils; -use crate::error::Result; -use crate::{array::*, io::parquet::read::is_type_nullable}; +use 
super::super::WriteOptions; +use crate::array::*; +use crate::{error::Result, io::parquet::read::schema::is_nullable}; fn encode(iterator: impl Iterator, buffer: &mut Vec) -> Result<()> { // encode values using bitpacking @@ -41,9 +41,9 @@ pub(super) fn encode_plain( pub fn array_to_page( array: &BooleanArray, options: WriteOptions, - descriptor: ColumnDescriptor, + descriptor: Descriptor, ) -> Result { - let is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let validity = array.validity(); @@ -69,6 +69,7 @@ pub fn array_to_page( utils::build_plain_page( buffer, array.len(), + array.len(), array.null_count(), 0, definition_levels_byte_length, diff --git a/src/io/parquet/write/boolean/nested.rs b/src/io/parquet/write/boolean/nested.rs index 427c7a05925..40645eea3d2 100644 --- a/src/io/parquet/write/boolean/nested.rs +++ b/src/io/parquet/write/boolean/nested.rs @@ -1,25 +1,23 @@ -use parquet2::{ - encoding::Encoding, metadata::ColumnDescriptor, page::DataPage, write::WriteOptions, -}; +use parquet2::{encoding::Encoding, metadata::Descriptor, page::DataPage}; -use super::super::{levels, utils}; +use super::super::{levels, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; +use crate::io::parquet::read::schema::is_nullable; use crate::{ array::{Array, BooleanArray, Offset}, error::Result, - io::parquet::read::is_type_nullable, }; pub fn array_to_page( array: &BooleanArray, options: WriteOptions, - descriptor: ColumnDescriptor, + descriptor: Descriptor, nested: levels::NestedInfo, ) -> Result where O: Offset, { - let is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let validity = array.validity(); @@ -41,6 +39,7 @@ where utils::build_plain_page( buffer, levels::num_values(nested.offsets()), + nested.offsets().len().saturating_sub(1), array.null_count(), repetition_levels_byte_length, definition_levels_byte_length, diff --git a/src/io/parquet/write/dictionary.rs b/src/io/parquet/write/dictionary.rs index 521c863aac7..7a36f2bbef1 100644 --- a/src/io/parquet/write/dictionary.rs +++ b/src/io/parquet/write/dictionary.rs @@ -1,29 +1,37 @@ use parquet2::{ encoding::{hybrid_rle::encode_u32, Encoding}, - metadata::ColumnDescriptor, + metadata::Descriptor, page::{EncodedDictPage, EncodedPage}, - write::{DynIter, WriteOptions}, + statistics::ParquetStatistics, + write::DynIter, }; +use super::binary::build_statistics as binary_build_statistics; use super::binary::encode_plain as binary_encode_plain; +use super::fixed_len_bytes::build_statistics as fixed_binary_build_statistics; use super::fixed_len_bytes::encode_plain as fixed_binary_encode_plain; +use super::primitive::build_statistics as primitive_build_statistics; use super::primitive::encode_plain as primitive_encode_plain; +use super::utf8::build_statistics as utf8_build_statistics; use super::utf8::encode_plain as utf8_encode_plain; -use crate::array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}; +use super::WriteOptions; use crate::bitmap::Bitmap; use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; -use crate::io::parquet::read::is_type_nullable; use crate::io::parquet::write::utils; +use crate::{ + array::{Array, DictionaryArray, DictionaryKey, PrimitiveArray}, + io::parquet::read::schema::is_nullable, +}; fn encode_keys( array: &PrimitiveArray, - // todo: merge this to not discard values' validity validity: Option<&Bitmap>, - descriptor: 
ColumnDescriptor, + descriptor: Descriptor, + statistics: ParquetStatistics, options: WriteOptions, ) -> Result { - let is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let mut buffer = vec![]; @@ -94,10 +102,11 @@ fn encode_keys( utils::build_plain_page( buffer, array.len(), + array.len(), array.null_count(), 0, definition_levels_byte_length, - None, + Some(statistics), descriptor, options, Encoding::RleDictionary, @@ -106,74 +115,83 @@ fn encode_keys( } macro_rules! dyn_prim { - ($from:ty, $to:ty, $array:expr, $options:expr) => {{ + ($from:ty, $to:ty, $array:expr, $options:expr, $descriptor:expr) => {{ let values = $array.values().as_any().downcast_ref().unwrap(); let mut buffer = vec![]; primitive_encode_plain::<$from, $to>(values, false, &mut buffer); - EncodedDictPage::new(buffer, values.len()) + ( + EncodedDictPage::new(buffer, values.len()), + primitive_build_statistics::<$from, $to>(values, $descriptor.primitive_type.clone()), + ) }}; } pub fn array_to_pages( array: &DictionaryArray, - descriptor: ColumnDescriptor, + descriptor: Descriptor, options: WriteOptions, encoding: Encoding, ) -> Result>> { match encoding { Encoding::PlainDictionary | Encoding::RleDictionary => { // write DictPage - let dict_page = match array.values().data_type().to_logical_type() { - DataType::Int8 => dyn_prim!(i8, i32, array, options), - DataType::Int16 => dyn_prim!(i16, i32, array, options), + let (dict_page, statistics) = match array.values().data_type().to_logical_type() { + DataType::Int8 => dyn_prim!(i8, i32, array, options, descriptor), + DataType::Int16 => dyn_prim!(i16, i32, array, options, descriptor), DataType::Int32 | DataType::Date32 | DataType::Time32(_) => { - dyn_prim!(i32, i32, array, options) + dyn_prim!(i32, i32, array, options, descriptor) } DataType::Int64 | DataType::Date64 | DataType::Time64(_) | DataType::Timestamp(_, _) - | DataType::Duration(_) => dyn_prim!(i64, i64, array, options), - DataType::UInt8 => dyn_prim!(u8, i32, array, options), - DataType::UInt16 => dyn_prim!(u16, i32, array, options), - DataType::UInt32 => dyn_prim!(u32, i32, array, options), - DataType::UInt64 => dyn_prim!(i64, i64, array, options), - DataType::Float32 => dyn_prim!(f32, f32, array, options), - DataType::Float64 => dyn_prim!(f64, f64, array, options), + | DataType::Duration(_) => dyn_prim!(i64, i64, array, options, descriptor), + DataType::UInt8 => dyn_prim!(u8, i32, array, options, descriptor), + DataType::UInt16 => dyn_prim!(u16, i32, array, options, descriptor), + DataType::UInt32 => dyn_prim!(u32, i32, array, options, descriptor), + DataType::UInt64 => dyn_prim!(i64, i64, array, options, descriptor), + DataType::Float32 => dyn_prim!(f32, f32, array, options, descriptor), + DataType::Float64 => dyn_prim!(f64, f64, array, options, descriptor), DataType::Utf8 => { - let values = array.values().as_any().downcast_ref().unwrap(); + let array = array.values().as_any().downcast_ref().unwrap(); let mut buffer = vec![]; - utf8_encode_plain::(values, false, &mut buffer); - EncodedDictPage::new(buffer, values.len()) + utf8_encode_plain::(array, false, &mut buffer); + let stats = utf8_build_statistics(array, descriptor.primitive_type.clone()); + (EncodedDictPage::new(buffer, array.len()), stats) } DataType::LargeUtf8 => { - let values = array.values().as_any().downcast_ref().unwrap(); + let array = array.values().as_any().downcast_ref().unwrap(); let mut buffer = vec![]; - utf8_encode_plain::(values, false, &mut buffer); - 
EncodedDictPage::new(buffer, values.len()) + utf8_encode_plain::(array, false, &mut buffer); + let stats = utf8_build_statistics(array, descriptor.primitive_type.clone()); + (EncodedDictPage::new(buffer, array.len()), stats) } DataType::Binary => { - let values = array.values().as_any().downcast_ref().unwrap(); + let array = array.values().as_any().downcast_ref().unwrap(); let mut buffer = vec![]; - binary_encode_plain::(values, false, &mut buffer); - EncodedDictPage::new(buffer, values.len()) + binary_encode_plain::(array, false, &mut buffer); + let stats = binary_build_statistics(array, descriptor.primitive_type.clone()); + (EncodedDictPage::new(buffer, array.len()), stats) } DataType::LargeBinary => { - let values = array.values().as_any().downcast_ref().unwrap(); + let array = array.values().as_any().downcast_ref().unwrap(); let mut buffer = vec![]; - binary_encode_plain::(values, false, &mut buffer); - EncodedDictPage::new(buffer, values.len()) + binary_encode_plain::(array, false, &mut buffer); + let stats = binary_build_statistics(array, descriptor.primitive_type.clone()); + (EncodedDictPage::new(buffer, array.len()), stats) } DataType::FixedSizeBinary(_) => { let mut buffer = vec![]; let array = array.values().as_any().downcast_ref().unwrap(); fixed_binary_encode_plain(array, false, &mut buffer); - EncodedDictPage::new(buffer, array.len()) + let stats = + fixed_binary_build_statistics(array, descriptor.primitive_type.clone()); + (EncodedDictPage::new(buffer, array.len()), stats) } other => { return Err(ArrowError::NotYetImplemented(format!( @@ -185,8 +203,13 @@ pub fn array_to_pages( let dict_page = EncodedPage::Dict(dict_page); // write DataPage pointing to DictPage - let data_page = - encode_keys(array.keys(), array.values().validity(), descriptor, options)?; + let data_page = encode_keys( + array.keys(), + array.values().validity(), + descriptor, + statistics, + options, + )?; let iter = std::iter::once(Ok(dict_page)).chain(std::iter::once(Ok(data_page))); Ok(DynIter::new(Box::new(iter))) diff --git a/src/io/parquet/write/file.rs b/src/io/parquet/write/file.rs index 47f595a1717..27c62f6edf5 100644 --- a/src/io/parquet/write/file.rs +++ b/src/io/parquet/write/file.rs @@ -1,13 +1,14 @@ use std::io::Write; +use parquet2::metadata::KeyValue; use parquet2::metadata::SchemaDescriptor; use parquet2::write::RowGroupIter; -use parquet2::{metadata::KeyValue, write::WriteOptions}; +use parquet2::write::WriteOptions as FileWriteOptions; use crate::datatypes::Schema; use crate::error::{ArrowError, Result}; -use super::{schema::schema_to_metadata_key, to_parquet_schema}; +use super::{schema::schema_to_metadata_key, to_parquet_schema, WriteOptions}; /// Attaches [`Schema`] to `key_value_metadata` pub fn add_arrow_schema( @@ -26,13 +27,14 @@ pub fn add_arrow_schema( pub struct FileWriter { writer: parquet2::write::FileWriter, schema: Schema, + options: WriteOptions, } // Accessors impl FileWriter { /// The options assigned to the file - pub fn options(&self) -> &WriteOptions { - self.writer.options() + pub fn options(&self) -> WriteOptions { + self.options } /// The [`SchemaDescriptor`] assigned to this file @@ -56,8 +58,17 @@ impl FileWriter { let created_by = Some("Arrow2 - Native Rust implementation of Arrow".to_string()); Ok(Self { - writer: parquet2::write::FileWriter::new(writer, parquet_schema, options, created_by), + writer: parquet2::write::FileWriter::new( + writer, + parquet_schema, + FileWriteOptions { + version: options.version, + write_statistics: options.write_statistics, + }, + 
created_by, + ), schema, + options, }) } @@ -67,17 +78,18 @@ impl FileWriter { } /// Writes a row group to the file. - pub fn write( - &mut self, - row_group: RowGroupIter<'_, ArrowError>, - num_rows: usize, - ) -> Result<()> { - Ok(self.writer.write(row_group, num_rows)?) + pub fn write(&mut self, row_group: RowGroupIter<'_, ArrowError>) -> Result<()> { + Ok(self.writer.write(row_group)?) } /// Writes the footer of the parquet file. Returns the total size of the file. - pub fn end(self, key_value_metadata: Option>) -> Result<(u64, W)> { + pub fn end(&mut self, key_value_metadata: Option>) -> Result { let key_value_metadata = add_arrow_schema(&self.schema, key_value_metadata); Ok(self.writer.end(key_value_metadata)?) } + + /// Consumes this writer and returns the inner writer + pub fn into_inner(self) -> W { + self.writer.into_inner() + } } diff --git a/src/io/parquet/write/fixed_len_bytes.rs b/src/io/parquet/write/fixed_len_bytes.rs index e129ab66c46..59ae75134e8 100644 --- a/src/io/parquet/write/fixed_len_bytes.rs +++ b/src/io/parquet/write/fixed_len_bytes.rs @@ -1,16 +1,16 @@ use parquet2::{ encoding::Encoding, - metadata::ColumnDescriptor, + metadata::Descriptor, page::DataPage, - statistics::{deserialize_statistics, serialize_statistics, ParquetStatistics}, - write::WriteOptions, + schema::types::PrimitiveType, + statistics::{serialize_statistics, FixedLenStatistics, ParquetStatistics, Statistics}, }; -use super::{binary::ord_binary, utils}; +use super::{binary::ord_binary, utils, WriteOptions}; use crate::{ array::{Array, FixedSizeBinaryArray}, error::Result, - io::parquet::read::is_type_nullable, + io::parquet::read::schema::is_nullable, }; pub(crate) fn encode_plain(array: &FixedSizeBinaryArray, is_optional: bool, buffer: &mut Vec) { @@ -29,9 +29,9 @@ pub(crate) fn encode_plain(array: &FixedSizeBinaryArray, is_optional: bool, buff pub fn array_to_page( array: &FixedSizeBinaryArray, options: WriteOptions, - descriptor: ColumnDescriptor, + descriptor: Descriptor, ) -> Result { - let is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let validity = array.validity(); let mut buffer = vec![]; @@ -48,7 +48,7 @@ pub fn array_to_page( encode_plain(array, is_optional, &mut buffer); let statistics = if options.write_statistics { - build_statistics(array, descriptor.clone()) + Some(build_statistics(array, descriptor.primitive_type.clone())) } else { None }; @@ -56,6 +56,7 @@ pub fn array_to_page( utils::build_plain_page( buffer, array.len(), + array.len(), array.null_count(), 0, definition_levels_byte_length, @@ -68,11 +69,10 @@ pub fn array_to_page( pub(super) fn build_statistics( array: &FixedSizeBinaryArray, - descriptor: ColumnDescriptor, -) -> Option { - let pq_statistics = &ParquetStatistics { - max: None, - min: None, + primitive_type: PrimitiveType, +) -> ParquetStatistics { + let statistics = &FixedLenStatistics { + primitive_type, null_count: Some(array.null_count() as i64), distinct_count: None, max_value: array @@ -85,8 +85,6 @@ pub(super) fn build_statistics( .flatten() .min_by(|x, y| ord_binary(x, y)) .map(|x| x.to_vec()), - }; - deserialize_statistics(pq_statistics, descriptor) - .map(|e| serialize_statistics(&*e)) - .ok() + } as &dyn Statistics; + serialize_statistics(statistics) } diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index 7c31f27fc52..07bf6211edd 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -16,7 +16,7 @@ use crate::array::*; use 
crate::bitmap::Bitmap; use crate::datatypes::*; use crate::error::{ArrowError, Result}; -use crate::io::parquet::read::is_type_nullable; +use crate::io::parquet::read::schema::is_nullable; use crate::io::parquet::write::levels::NestedInfo; use crate::types::days_ms; use crate::types::NativeType; @@ -26,15 +26,24 @@ pub use parquet2::{ compression::Compression, encoding::Encoding, fallible_streaming_iterator, - metadata::{ColumnDescriptor, KeyValue, SchemaDescriptor}, + metadata::{Descriptor, KeyValue, SchemaDescriptor}, page::{CompressedDataPage, CompressedPage, EncodedPage}, schema::types::ParquetType, - write::{ - compress, Compressor, DynIter, DynStreamingIterator, RowGroupIter, Version, WriteOptions, - }, + write::{compress, Compressor, DynIter, DynStreamingIterator, RowGroupIter, Version}, FallibleStreamingIterator, }; +/// Currently supported options to write to parquet +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct WriteOptions { + /// Whether to write statistics + pub write_statistics: bool, + /// The page and file version to use + pub version: Version, + /// The compression to apply to every page + pub compression: Compression, +} + pub use file::FileWriter; pub use row_group::{row_group_iter, RowGroupIterator}; pub use schema::to_parquet_type; @@ -80,7 +89,7 @@ pub fn can_encode(data_type: &DataType, encoding: Encoding) -> bool { /// Returns an iterator of [`EncodedPage`]. pub fn array_to_pages( array: &dyn Array, - descriptor: ColumnDescriptor, + descriptor: Descriptor, options: WriteOptions, encoding: Encoding, ) -> Result>> { @@ -103,7 +112,7 @@ pub fn array_to_pages( /// Converts an [`Array`] to a [`CompressedPage`] based on options, descriptor and `encoding`. pub fn array_to_page( array: &dyn Array, - descriptor: ColumnDescriptor, + descriptor: Descriptor, options: WriteOptions, encoding: Encoding, ) -> Result { @@ -316,11 +325,11 @@ fn list_array_to_page( offsets: &[O], validity: Option<&Bitmap>, values: &dyn Array, - descriptor: ColumnDescriptor, + descriptor: Descriptor, options: WriteOptions, ) -> Result { use DataType::*; - let is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let nested = NestedInfo::new(offsets, validity, is_optional); match values.data_type() { @@ -347,47 +356,19 @@ fn list_array_to_page( Utf8 => { let values = values.as_any().downcast_ref().unwrap(); - let is_optional = is_type_nullable(descriptor.type_()); - - utf8::nested_array_to_page::( - values, - options, - descriptor, - NestedInfo::new(offsets, validity, is_optional), - ) + utf8::nested_array_to_page::(values, options, descriptor, nested) } LargeUtf8 => { let values = values.as_any().downcast_ref().unwrap(); - let is_optional = is_type_nullable(descriptor.type_()); - - utf8::nested_array_to_page::( - values, - options, - descriptor, - NestedInfo::new(offsets, validity, is_optional), - ) + utf8::nested_array_to_page::(values, options, descriptor, nested) } Binary => { let values = values.as_any().downcast_ref().unwrap(); - let is_optional = is_type_nullable(descriptor.type_()); - - binary::nested_array_to_page::( - values, - options, - descriptor, - NestedInfo::new(offsets, validity, is_optional), - ) + binary::nested_array_to_page::(values, options, descriptor, nested) } LargeBinary => { let values = values.as_any().downcast_ref().unwrap(); - let is_optional = is_type_nullable(descriptor.type_()); - - binary::nested_array_to_page::( - values, - options, - descriptor, - NestedInfo::new(offsets, validity, 
is_optional), - ) + binary::nested_array_to_page::(values, options, descriptor, nested) } _ => todo!(), } @@ -395,7 +376,7 @@ fn list_array_to_page( fn nested_array_to_page( array: &dyn Array, - descriptor: ColumnDescriptor, + descriptor: Descriptor, options: WriteOptions, ) -> Result { match array.data_type() { diff --git a/src/io/parquet/write/primitive/basic.rs b/src/io/parquet/write/primitive/basic.rs index 9b9deb16d0b..1c58804fa1a 100644 --- a/src/io/parquet/write/primitive/basic.rs +++ b/src/io/parquet/write/primitive/basic.rs @@ -1,17 +1,18 @@ use parquet2::{ encoding::Encoding, - metadata::ColumnDescriptor, + metadata::Descriptor, page::DataPage, + schema::types::PrimitiveType, statistics::{serialize_statistics, ParquetStatistics, PrimitiveStatistics, Statistics}, types::NativeType, - write::WriteOptions, }; use super::super::utils; +use super::super::WriteOptions; use crate::{ array::{Array, PrimitiveArray}, error::Result, - io::parquet::read::is_type_nullable, + io::parquet::read::schema::is_nullable, types::NativeType as ArrowNativeType, }; @@ -41,14 +42,14 @@ where pub fn array_to_page( array: &PrimitiveArray, options: WriteOptions, - descriptor: ColumnDescriptor, + descriptor: Descriptor, ) -> Result where T: ArrowNativeType, R: NativeType, T: num_traits::AsPrimitive, { - let is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let validity = array.validity(); @@ -66,7 +67,7 @@ where encode_plain(array, is_optional, &mut buffer); let statistics = if options.write_statistics { - Some(build_statistics(array, descriptor.clone())) + Some(build_statistics(array, descriptor.primitive_type.clone())) } else { None }; @@ -74,6 +75,7 @@ where utils::build_plain_page( buffer, array.len(), + array.len(), array.null_count(), 0, definition_levels_byte_length, @@ -86,7 +88,7 @@ where pub fn build_statistics( array: &PrimitiveArray, - descriptor: ColumnDescriptor, + primitive_type: PrimitiveType, ) -> ParquetStatistics where T: ArrowNativeType, @@ -94,7 +96,7 @@ where T: num_traits::AsPrimitive, { let statistics = &PrimitiveStatistics:: { - descriptor, + primitive_type, null_count: Some(array.null_count() as i64), distinct_count: None, max_value: array diff --git a/src/io/parquet/write/primitive/mod.rs b/src/io/parquet/write/primitive/mod.rs index ddeb6541605..eec1d695d1d 100644 --- a/src/io/parquet/write/primitive/mod.rs +++ b/src/io/parquet/write/primitive/mod.rs @@ -2,5 +2,6 @@ mod basic; mod nested; pub use basic::array_to_page; +pub(crate) use basic::build_statistics; pub(crate) use basic::encode_plain; pub use nested::array_to_page as nested_array_to_page; diff --git a/src/io/parquet/write/primitive/nested.rs b/src/io/parquet/write/primitive/nested.rs index 5be103d08b9..86732fdae97 100644 --- a/src/io/parquet/write/primitive/nested.rs +++ b/src/io/parquet/write/primitive/nested.rs @@ -1,22 +1,20 @@ -use parquet2::{ - encoding::Encoding, metadata::ColumnDescriptor, page::DataPage, types::NativeType, - write::WriteOptions, -}; +use parquet2::{encoding::Encoding, metadata::Descriptor, page::DataPage, types::NativeType}; use super::super::levels; use super::super::utils; +use super::super::WriteOptions; use super::basic::{build_statistics, encode_plain}; +use crate::io::parquet::read::schema::is_nullable; use crate::{ array::{Array, Offset, PrimitiveArray}, error::Result, - io::parquet::read::is_type_nullable, types::NativeType as ArrowNativeType, }; pub fn array_to_page( array: &PrimitiveArray, options: WriteOptions, - 
descriptor: ColumnDescriptor, + descriptor: Descriptor, nested: levels::NestedInfo, ) -> Result where @@ -25,7 +23,7 @@ where T: num_traits::AsPrimitive, O: Offset, { - let is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let validity = array.validity(); @@ -39,7 +37,7 @@ where encode_plain(array, is_optional, &mut buffer); let statistics = if options.write_statistics { - Some(build_statistics(array, descriptor.clone())) + Some(build_statistics(array, descriptor.primitive_type.clone())) } else { None }; @@ -47,6 +45,7 @@ where utils::build_plain_page( buffer, levels::num_values(nested.offsets()), + nested.offsets().len().saturating_sub(1), array.null_count(), repetition_levels_byte_length, definition_levels_byte_length, diff --git a/src/io/parquet/write/row_group.rs b/src/io/parquet/write/row_group.rs index f6076808ac5..5c419640395 100644 --- a/src/io/parquet/write/row_group.rs +++ b/src/io/parquet/write/row_group.rs @@ -28,13 +28,15 @@ pub fn row_group_iter + 'static + Send + Sync>( .zip(columns.into_iter()) .zip(encodings.into_iter()) .map(move |((array, descriptor), encoding)| { - array_to_pages(array.as_ref(), descriptor, options, encoding).map(move |pages| { - let encoded_pages = DynIter::new(pages.map(|x| Ok(x?))); - let compressed_pages = - Compressor::new(encoded_pages, options.compression, vec![]) - .map_err(ArrowError::from); - DynStreamingIterator::new(compressed_pages) - }) + array_to_pages(array.as_ref(), descriptor.descriptor, options, encoding).map( + move |pages| { + let encoded_pages = DynIter::new(pages.map(|x| Ok(x?))); + let compressed_pages = + Compressor::new(encoded_pages, options.compression, vec![]) + .map_err(ArrowError::from); + DynStreamingIterator::new(compressed_pages) + }, + ) }), ) } @@ -78,23 +80,19 @@ impl + 'static, I: Iterator>>> RowGro impl + 'static + Send + Sync, I: Iterator>>> Iterator for RowGroupIterator { - type Item = Result<(RowGroupIter<'static, ArrowError>, usize)>; + type Item = Result>; fn next(&mut self) -> Option { let options = self.options; self.iter.next().map(|maybe_chunk| { let chunk = maybe_chunk?; - let len = chunk.len(); let encodings = self.encodings.clone(); - Ok(( - row_group_iter( - chunk, - encodings, - self.parquet_schema.columns().to_vec(), - options, - ), - len, + Ok(row_group_iter( + chunk, + encodings, + self.parquet_schema.columns().to_vec(), + options, )) }) } diff --git a/src/io/parquet/write/schema.rs b/src/io/parquet/write/schema.rs index ea05e7e19cd..7bf5e48b6cd 100644 --- a/src/io/parquet/write/schema.rs +++ b/src/io/parquet/write/schema.rs @@ -2,8 +2,8 @@ use parquet2::{ metadata::KeyValue, schema::{ types::{ - DecimalType, IntType, LogicalType, ParquetType, PhysicalType, PrimitiveConvertedType, - TimeType, TimeUnit as ParquetTimeUnit, TimestampType, + GroupLogicalType, IntegerType, ParquetType, PhysicalType, PrimitiveConvertedType, + PrimitiveLogicalType, TimeUnit as ParquetTimeUnit, }, Repetition, }, @@ -53,7 +53,7 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::Int32, repetition, None, - Some(LogicalType::UNKNOWN(Default::default())), + Some(PrimitiveLogicalType::Unknown), None, )?), DataType::Boolean => Ok(ParquetType::try_from_primitive( @@ -120,7 +120,7 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::ByteArray, repetition, Some(PrimitiveConvertedType::Utf8), - Some(LogicalType::STRING(Default::default())), + Some(PrimitiveLogicalType::String), None, )?), DataType::Date32 => 
Ok(ParquetType::try_from_primitive( @@ -128,7 +128,7 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::Int32, repetition, Some(PrimitiveConvertedType::Date), - Some(LogicalType::DATE(Default::default())), + Some(PrimitiveLogicalType::Date), None, )?), DataType::Int8 => Ok(ParquetType::try_from_primitive( @@ -136,10 +136,7 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::Int32, repetition, Some(PrimitiveConvertedType::Int8), - Some(LogicalType::INTEGER(IntType { - bit_width: 8, - is_signed: true, - })), + Some(PrimitiveLogicalType::Integer(IntegerType::Int8)), None, )?), DataType::Int16 => Ok(ParquetType::try_from_primitive( @@ -147,10 +144,7 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::Int32, repetition, Some(PrimitiveConvertedType::Int16), - Some(LogicalType::INTEGER(IntType { - bit_width: 16, - is_signed: true, - })), + Some(PrimitiveLogicalType::Integer(IntegerType::Int16)), None, )?), DataType::UInt8 => Ok(ParquetType::try_from_primitive( @@ -158,10 +152,7 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::Int32, repetition, Some(PrimitiveConvertedType::Uint8), - Some(LogicalType::INTEGER(IntType { - bit_width: 8, - is_signed: false, - })), + Some(PrimitiveLogicalType::Integer(IntegerType::UInt8)), None, )?), DataType::UInt16 => Ok(ParquetType::try_from_primitive( @@ -169,10 +160,7 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::Int32, repetition, Some(PrimitiveConvertedType::Uint16), - Some(LogicalType::INTEGER(IntType { - bit_width: 16, - is_signed: false, - })), + Some(PrimitiveLogicalType::Integer(IntegerType::UInt16)), None, )?), DataType::UInt32 => Ok(ParquetType::try_from_primitive( @@ -180,10 +168,7 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::Int32, repetition, Some(PrimitiveConvertedType::Uint32), - Some(LogicalType::INTEGER(IntType { - bit_width: 32, - is_signed: false, - })), + Some(PrimitiveLogicalType::Integer(IntegerType::UInt32)), None, )?), DataType::UInt64 => Ok(ParquetType::try_from_primitive( @@ -191,10 +176,7 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::Int64, repetition, Some(PrimitiveConvertedType::Uint64), - Some(LogicalType::INTEGER(IntType { - bit_width: 64, - is_signed: false, - })), + Some(PrimitiveLogicalType::Integer(IntegerType::UInt64)), None, )?), // no natural representation in parquet; leave it as is. @@ -212,15 +194,15 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::Int64, repetition, None, - Some(LogicalType::TIMESTAMP(TimestampType { - is_adjusted_to_u_t_c: matches!(zone, Some(z) if !z.as_str().is_empty()), + Some(PrimitiveLogicalType::Timestamp { + is_adjusted_to_utc: matches!(zone, Some(z) if !z.as_str().is_empty()), unit: match time_unit { TimeUnit::Second => unreachable!(), - TimeUnit::Millisecond => ParquetTimeUnit::MILLIS(Default::default()), - TimeUnit::Microsecond => ParquetTimeUnit::MICROS(Default::default()), - TimeUnit::Nanosecond => ParquetTimeUnit::NANOS(Default::default()), + TimeUnit::Millisecond => ParquetTimeUnit::Milliseconds, + TimeUnit::Microsecond => ParquetTimeUnit::Microseconds, + TimeUnit::Nanosecond => ParquetTimeUnit::Nanoseconds, }, - })), + }), None, )?), // no natural representation in parquet; leave it as is. 
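Note: the hunks above and below are representative of the whole schema migration: parquet2's struct-style logical types, e.g. `LogicalType::INTEGER(IntType { bit_width: 8, is_signed: true })`, become plain enum variants such as `PrimitiveLogicalType::Integer(IntegerType::Int8)`. A minimal sketch of the integer mapping under the new API, restricted to the `IntegerType` variants this patch exercises (the helper `integer_logical_type` is illustrative, not part of the patch):

use parquet2::schema::types::{IntegerType, PrimitiveLogicalType};

// Hypothetical helper: the old (bit_width, is_signed) pair now selects an
// enum variant directly instead of filling an IntType struct.
fn integer_logical_type(bit_width: u8, is_signed: bool) -> Option<PrimitiveLogicalType> {
    let integer = match (bit_width, is_signed) {
        (8, true) => IntegerType::Int8,
        (16, true) => IntegerType::Int16,
        (8, false) => IntegerType::UInt8,
        (16, false) => IntegerType::UInt16,
        (32, false) => IntegerType::UInt32,
        (64, false) => IntegerType::UInt64,
        _ => return None,
    };
    Some(PrimitiveLogicalType::Integer(integer))
}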
@@ -238,10 +220,10 @@ pub fn to_parquet_type(field: &Field) -> Result { PhysicalType::Int32, repetition, Some(PrimitiveConvertedType::TimeMillis), - Some(LogicalType::TIME(TimeType { - is_adjusted_to_u_t_c: false, - unit: ParquetTimeUnit::MILLIS(Default::default()), - })), + Some(PrimitiveLogicalType::Time { + is_adjusted_to_utc: false, + unit: ParquetTimeUnit::Milliseconds, + }), None, )?), DataType::Time64(time_unit) => Ok(ParquetType::try_from_primitive( @@ -253,14 +235,14 @@ pub fn to_parquet_type(field: &Field) -> Result { TimeUnit::Nanosecond => None, _ => unreachable!(), }, - Some(LogicalType::TIME(TimeType { - is_adjusted_to_u_t_c: false, + Some(PrimitiveLogicalType::Time { + is_adjusted_to_utc: false, unit: match time_unit { - TimeUnit::Microsecond => ParquetTimeUnit::MICROS(Default::default()), - TimeUnit::Nanosecond => ParquetTimeUnit::NANOS(Default::default()), + TimeUnit::Microsecond => ParquetTimeUnit::Microseconds, + TimeUnit::Nanosecond => ParquetTimeUnit::Nanoseconds, _ => unreachable!(), }, - })), + }), None, )?), DataType::Struct(fields) => { @@ -274,9 +256,9 @@ pub fn to_parquet_type(field: &Field) -> Result { .iter() .map(to_parquet_type) .collect::>>()?; - Ok(ParquetType::try_from_group( + Ok(ParquetType::from_group( name, repetition, None, None, fields, None, - )?) + )) } DataType::Dictionary(_, value, _) => { let dict_field = Field::new(name.as_str(), value.as_ref().clone(), field.is_nullable); @@ -284,7 +266,7 @@ pub fn to_parquet_type(field: &Field) -> Result { } DataType::FixedSizeBinary(size) => Ok(ParquetType::try_from_primitive( name, - PhysicalType::FixedLenByteArray(*size as i32), + PhysicalType::FixedLenByteArray(*size), repetition, None, None, @@ -293,27 +275,21 @@ pub fn to_parquet_type(field: &Field) -> Result { DataType::Decimal(precision, scale) => { let precision = *precision; let scale = *scale; - let logical_type = Some(LogicalType::DECIMAL(DecimalType { - scale: scale as i32, - precision: precision as i32, - })); + let logical_type = Some(PrimitiveLogicalType::Decimal(precision, scale)); let physical_type = if precision <= 9 { PhysicalType::Int32 } else if precision <= 18 { PhysicalType::Int64 } else { - let len = decimal_length_from_precision(precision) as i32; + let len = decimal_length_from_precision(precision); PhysicalType::FixedLenByteArray(len) }; Ok(ParquetType::try_from_primitive( name, physical_type, repetition, - Some(PrimitiveConvertedType::Decimal( - precision as i32, - scale as i32, - )), + Some(PrimitiveConvertedType::Decimal(precision, scale)), logical_type, None, )?) @@ -327,21 +303,21 @@ pub fn to_parquet_type(field: &Field) -> Result { None, )?), DataType::List(f) | DataType::FixedSizeList(f, _) | DataType::LargeList(f) => { - Ok(ParquetType::try_from_group( + Ok(ParquetType::from_group( name, repetition, None, - Some(LogicalType::LIST(Default::default())), - vec![ParquetType::try_from_group( + Some(GroupLogicalType::List), + vec![ParquetType::from_group( "list".to_string(), Repetition::Repeated, None, None, vec![to_parquet_type(f)?], None, - )?], + )], None, - )?) 
+ )) } other => Err(ArrowError::NotYetImplemented(format!( "Writing the data type {:?} is not yet implemented", diff --git a/src/io/parquet/write/sink.rs b/src/io/parquet/write/sink.rs index 8906be9431b..1c483c4ed8b 100644 --- a/src/io/parquet/write/sink.rs +++ b/src/io/parquet/write/sink.rs @@ -1,16 +1,14 @@ -use crate::{ - array::Array, - chunk::Chunk, - datatypes::Schema, - error::ArrowError, - io::parquet::write::{Encoding, SchemaDescriptor, WriteOptions}, -}; +use std::{collections::HashMap, pin::Pin, sync::Arc, task::Poll}; + use futures::{future::BoxFuture, AsyncWrite, FutureExt, Sink, TryFutureExt}; use parquet2::metadata::KeyValue; use parquet2::write::FileStreamer; -use std::{collections::HashMap, pin::Pin, sync::Arc, task::Poll}; +use parquet2::write::WriteOptions as ParquetWriteOptions; + +use crate::{array::Array, chunk::Chunk, datatypes::Schema, error::ArrowError}; use super::file::add_arrow_schema; +use super::{Encoding, SchemaDescriptor, WriteOptions}; /// Sink that writes array [`chunks`](Chunk) as a Parquet file. /// @@ -82,10 +80,17 @@ where encoding: Vec, options: WriteOptions, ) -> Result { - // let mut writer = FileStreamer::try_new(writer, schema.clone(), options)?; let parquet_schema = crate::io::parquet::write::to_parquet_schema(&schema)?; let created_by = Some("Arrow2 - Native Rust implementation of Arrow".to_string()); - let mut writer = FileStreamer::new(writer, parquet_schema.clone(), options, created_by); + let mut writer = FileStreamer::new( + writer, + parquet_schema.clone(), + ParquetWriteOptions { + version: options.version, + write_statistics: options.write_statistics, + }, + created_by, + ); let task = Some( async move { writer.start().await?; @@ -150,7 +155,6 @@ where fn start_send(self: Pin<&mut Self>, item: Chunk>) -> Result<(), Self::Error> { let this = self.get_mut(); if let Some(mut writer) = this.writer.take() { - let count = item.len(); let rows = crate::io::parquet::write::row_group_iter( item, this.encoding.clone(), @@ -158,7 +162,7 @@ where this.options, ); this.task = Some(Box::pin(async move { - writer.write(rows, count).await?; + writer.write(rows).await?; Ok(Some(writer)) })); Ok(()) diff --git a/src/io/parquet/write/utf8/basic.rs b/src/io/parquet/write/utf8/basic.rs index f1e8fd3d24c..3074e6fe738 100644 --- a/src/io/parquet/write/utf8/basic.rs +++ b/src/io/parquet/write/utf8/basic.rs @@ -1,17 +1,18 @@ use parquet2::{ encoding::Encoding, - metadata::ColumnDescriptor, + metadata::Descriptor, page::DataPage, + schema::types::PrimitiveType, statistics::{serialize_statistics, BinaryStatistics, ParquetStatistics, Statistics}, - write::WriteOptions, }; use super::super::binary::{encode_delta, ord_binary}; use super::super::utils; +use super::super::WriteOptions; use crate::{ array::{Array, Offset, Utf8Array}, error::{ArrowError, Result}, - io::parquet::read::is_type_nullable, + io::parquet::read::schema::is_nullable, }; pub(crate) fn encode_plain( @@ -41,11 +42,11 @@ pub(crate) fn encode_plain( pub fn array_to_page( array: &Utf8Array, options: WriteOptions, - descriptor: ColumnDescriptor, + descriptor: Descriptor, encoding: Encoding, ) -> Result { let validity = array.validity(); - let is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let mut buffer = vec![]; utils::write_def_levels( @@ -77,7 +78,7 @@ pub fn array_to_page( } let statistics = if options.write_statistics { - Some(build_statistics(array, descriptor.clone())) + Some(build_statistics(array, 
descriptor.primitive_type.clone())) } else { None }; @@ -85,6 +86,7 @@ pub fn array_to_page( utils::build_plain_page( buffer, array.len(), + array.len(), array.null_count(), 0, definition_levels_byte_length, @@ -95,12 +97,12 @@ pub fn array_to_page( ) } -pub(super) fn build_statistics( +pub(crate) fn build_statistics( array: &Utf8Array, - descriptor: ColumnDescriptor, + primitive_type: PrimitiveType, ) -> ParquetStatistics { let statistics = &BinaryStatistics { - descriptor, + primitive_type, null_count: Some(array.null_count() as i64), distinct_count: None, max_value: array diff --git a/src/io/parquet/write/utf8/mod.rs b/src/io/parquet/write/utf8/mod.rs index ddeb6541605..eec1d695d1d 100644 --- a/src/io/parquet/write/utf8/mod.rs +++ b/src/io/parquet/write/utf8/mod.rs @@ -2,5 +2,6 @@ mod basic; mod nested; pub use basic::array_to_page; +pub(crate) use basic::build_statistics; pub(crate) use basic::encode_plain; pub use nested::array_to_page as nested_array_to_page; diff --git a/src/io/parquet/write/utf8/nested.rs b/src/io/parquet/write/utf8/nested.rs index cb87fabf31f..32b83cc0d7e 100644 --- a/src/io/parquet/write/utf8/nested.rs +++ b/src/io/parquet/write/utf8/nested.rs @@ -1,26 +1,24 @@ -use parquet2::{ - encoding::Encoding, metadata::ColumnDescriptor, page::DataPage, write::WriteOptions, -}; +use parquet2::{encoding::Encoding, metadata::Descriptor, page::DataPage}; -use super::super::{levels, utils}; +use super::super::{levels, utils, WriteOptions}; use super::basic::{build_statistics, encode_plain}; +use crate::io::parquet::read::schema::is_nullable; use crate::{ array::{Array, Offset, Utf8Array}, error::Result, - io::parquet::read::is_type_nullable, }; pub fn array_to_page( array: &Utf8Array, options: WriteOptions, - descriptor: ColumnDescriptor, + descriptor: Descriptor, nested: levels::NestedInfo, ) -> Result where OO: Offset, O: Offset, { - let is_optional = is_type_nullable(descriptor.type_()); + let is_optional = is_nullable(&descriptor.primitive_type.field_info); let validity = array.validity(); @@ -34,7 +32,7 @@ where encode_plain(array, is_optional, &mut buffer); let statistics = if options.write_statistics { - Some(build_statistics(array, descriptor.clone())) + Some(build_statistics(array, descriptor.primitive_type.clone())) } else { None }; @@ -42,6 +40,7 @@ where utils::build_plain_page( buffer, levels::num_values(nested.offsets()), + nested.offsets().len().saturating_sub(1), array.null_count(), repetition_levels_byte_length, definition_levels_byte_length, diff --git a/src/io/parquet/write/utils.rs b/src/io/parquet/write/utils.rs index 6857bbc533f..851034d6ae9 100644 --- a/src/io/parquet/write/utils.rs +++ b/src/io/parquet/write/utils.rs @@ -3,12 +3,12 @@ use crate::bitmap::Bitmap; use parquet2::{ compression::Compression, encoding::{hybrid_rle::encode_bool, Encoding}, - metadata::ColumnDescriptor, + metadata::Descriptor, page::{DataPage, DataPageHeader, DataPageHeaderV1, DataPageHeaderV2}, statistics::ParquetStatistics, - write::WriteOptions, }; +use super::WriteOptions; use crate::error::Result; use super::Version; @@ -60,42 +60,42 @@ pub fn write_def_levels( #[allow(clippy::too_many_arguments)] pub fn build_plain_page( buffer: Vec, - len: usize, + num_values: usize, + num_rows: usize, null_count: usize, repetition_levels_byte_length: usize, definition_levels_byte_length: usize, statistics: Option, - descriptor: ColumnDescriptor, + descriptor: Descriptor, options: WriteOptions, encoding: Encoding, ) -> Result { - match options.version { - Version::V1 => { - let header = 
DataPageHeader::V1(DataPageHeaderV1 { - num_values: len as i32, - encoding: encoding.into(), - definition_level_encoding: Encoding::Rle.into(), - repetition_level_encoding: Encoding::Rle.into(), - statistics, - }); - - Ok(DataPage::new(header, buffer, None, descriptor)) - } - Version::V2 => { - let header = DataPageHeader::V2(DataPageHeaderV2 { - num_values: len as i32, - encoding: encoding.into(), - num_nulls: null_count as i32, - num_rows: len as i32, - definition_levels_byte_length: definition_levels_byte_length as i32, - repetition_levels_byte_length: repetition_levels_byte_length as i32, - is_compressed: Some(options.compression != Compression::Uncompressed), - statistics, - }); - - Ok(DataPage::new(header, buffer, None, descriptor)) - } - } + let header = match options.version { + Version::V1 => DataPageHeader::V1(DataPageHeaderV1 { + num_values: num_values as i32, + encoding: encoding.into(), + definition_level_encoding: Encoding::Rle.into(), + repetition_level_encoding: Encoding::Rle.into(), + statistics, + }), + Version::V2 => DataPageHeader::V2(DataPageHeaderV2 { + num_values: num_values as i32, + encoding: encoding.into(), + num_nulls: null_count as i32, + num_rows: num_rows as i32, + definition_levels_byte_length: definition_levels_byte_length as i32, + repetition_levels_byte_length: repetition_levels_byte_length as i32, + is_compressed: Some(options.compression != Compression::Uncompressed), + statistics, + }), + }; + Ok(DataPage::new( + header, + buffer, + None, + descriptor, + Some(num_rows), + )) } /// Auxiliary iterator adapter to declare the size hint of an iterator. diff --git a/tests/it/io/parquet/mod.rs b/tests/it/io/parquet/mod.rs index 5e1b59c7488..7c8daf23641 100644 --- a/tests/it/io/parquet/mod.rs +++ b/tests/it/io/parquet/mod.rs @@ -9,6 +9,7 @@ use arrow2::{ use crate::io::ipc::read_gzip_json; mod read; +mod read_indexes; mod write; mod write_async; @@ -22,12 +23,18 @@ pub fn read_column( let metadata = read_metadata(&mut reader)?; let schema = infer_schema(&metadata)?; + // verify that we can read indexes + let _indexes = read_columns_indexes( + &mut reader, + metadata.row_groups[0].columns(), + &schema.fields, + )?; + let column = schema .fields .iter() .enumerate() - .filter_map(|(i, f)| if f.name == column { Some(i) } else { None }) - .next() + .find_map(|(i, f)| if f.name == column { Some(i) } else { None }) .unwrap(); let mut reader = FileReader::try_new(reader, Some(&[column]), None, None, None)?; @@ -329,7 +336,7 @@ pub fn pyarrow_nullable(column: &str) -> Box { .collect::>(); Box::new(PrimitiveArray::::from(values)) } - "string_large" => { + "int32_dict" => { let keys = PrimitiveArray::::from([Some(0), Some(1), None, Some(1)]); let values = Arc::new(PrimitiveArray::::from_slice([10, 200])); Box::new(DictionaryArray::::from_data(keys, values)) @@ -413,7 +420,13 @@ pub fn pyarrow_nullable_statistics(column: &str) -> Option> min_value: Some(0), max_value: Some(9), }), - "string_large" => return None, + "int32_dict" => Box::new(PrimitiveStatistics { + data_type: DataType::Dictionary(IntegerType::Int32, Box::new(DataType::Int32), false), + null_count: Some(0), + distinct_count: None, + min_value: Some(10), + max_value: Some(200), + }), "decimal_9" => Box::new(PrimitiveStatistics:: { distinct_count: None, null_count: Some(3), @@ -716,12 +729,11 @@ fn integration_write(schema: &Schema, batches: &[Chunk>]) -> Resu writer.start()?; for group in row_groups { - let (group, len) = group?; - writer.write(group, len)?; + writer.write(group?)?; } - let (_size, writer) = 
writer.end(None)?;
+    writer.end(None)?;
 
-    Ok(writer.into_inner())
+    Ok(writer.into_inner().into_inner())
 }
 
 type IntegrationRead = (Schema, Vec<Chunk<Arc<dyn Array>>>);
diff --git a/tests/it/io/parquet/read_indexes.rs b/tests/it/io/parquet/read_indexes.rs
new file mode 100644
index 00000000000..ade8b268f16
--- /dev/null
+++ b/tests/it/io/parquet/read_indexes.rs
@@ -0,0 +1,223 @@
+use std::io::Cursor;
+use std::sync::Arc;
+
+use arrow2::error::ArrowError;
+use arrow2::{array::*, datatypes::*, error::Result, io::parquet::read::*, io::parquet::write::*};
+use parquet2::indexes::{compute_rows, select_pages};
+use parquet2::read::IndexedPageReader;
+
+/// Returns two sets of pages with the same number of rows, distributed unevenly across pages.
+fn pages(
+    arrays: &[&dyn Array],
+    encoding: Encoding,
+) -> Result<(Vec<EncodedPage>, Vec<EncodedPage>, Schema)> {
+    // create pages with different number of rows
+    let array11 = PrimitiveArray::<i64>::from_slice([1, 2, 3, 4]);
+    let array12 = PrimitiveArray::<i64>::from_slice([5]);
+    let array13 = PrimitiveArray::<i64>::from_slice([6]);
+
+    let schema = Schema::from(vec![
+        Field::new("a1", DataType::Int64, false),
+        Field::new(
+            "a2",
+            arrays[0].data_type().clone(),
+            arrays.iter().map(|x| x.null_count()).sum::<usize>() != 0usize,
+        ),
+    ]);
+
+    let parquet_schema = to_parquet_schema(&schema)?;
+
+    let options = WriteOptions {
+        write_statistics: true,
+        compression: Compression::Uncompressed,
+        version: Version::V1,
+    };
+
+    let pages1 = vec![
+        array_to_page(
+            &array11,
+            parquet_schema.columns()[0].descriptor.clone(),
+            options,
+            Encoding::Plain,
+        )?,
+        array_to_page(
+            &array12,
+            parquet_schema.columns()[0].descriptor.clone(),
+            options,
+            Encoding::Plain,
+        )?,
+        array_to_page(
+            &array13,
+            parquet_schema.columns()[0].descriptor.clone(),
+            options,
+            Encoding::Plain,
+        )?,
+    ];
+    let pages2 = arrays
+        .iter()
+        .flat_map(|array| {
+            array_to_pages(
+                *array,
+                parquet_schema.columns()[1].descriptor.clone(),
+                options,
+                encoding,
+            )
+            .unwrap()
+            .collect::<Result<Vec<_>>>()
+            .unwrap()
+        })
+        .collect::<Vec<_>>();
+
+    Ok((pages1, pages2, schema))
+}
+
+/// Tests reading a subset of pages, selected via the page indexes.
+fn read_with_indexes(
+    (pages1, pages2, schema): (Vec<EncodedPage>, Vec<EncodedPage>, Schema),
+    expected: Arc<dyn Array>,
+) -> Result<()> {
+    let options = WriteOptions {
+        write_statistics: true,
+        compression: Compression::Uncompressed,
+        version: Version::V1,
+    };
+
+    let to_compressed = |pages: Vec<EncodedPage>| {
+        let encoded_pages = DynIter::new(pages.into_iter().map(Ok));
+        let compressed_pages =
+            Compressor::new(encoded_pages, options.compression, vec![]).map_err(ArrowError::from);
+        Result::Ok(DynStreamingIterator::new(compressed_pages))
+    };
+
+    let row_group = DynIter::new(vec![to_compressed(pages1), to_compressed(pages2)].into_iter());
+
+    let writer = vec![];
+    let mut writer = FileWriter::try_new(writer, schema, options)?;
+
+    writer.start()?;
+    writer.write(row_group)?;
+    writer.end(None)?;
+    let data = writer.into_inner();
+
+    let mut reader = Cursor::new(data);
+
+    let metadata = read_metadata(&mut reader)?;
+
+    let schema = infer_schema(&metadata)?;
+
+    let row_group = &metadata.row_groups[0];
+
+    let pages = read_pages_locations(&mut reader, row_group.columns())?;
+
+    // say we concluded from the indexes that we only needed the "6" from the first column, so second page.
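+    // In outline: `read_pages_locations` (above) gives each column's page
+    // locations; `compute_rows` turns the per-page selection on the first
+    // column into row intervals; `select_pages` projects those intervals onto
+    // the second column's pages; `IndexedPageReader` then decodes only the
+    // selected pages.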
+ let _indexes = read_columns_indexes(&mut reader, row_group.columns(), &schema.fields)?; + let intervals = compute_rows(&[false, true, false], &pages[0], row_group.num_rows())?; + + // based on the intervals from c1, we compute which pages from the second column are required: + let pages = select_pages(&intervals, &pages[1], row_group.num_rows())?; + + // and read them: + let c1 = &metadata.row_groups[0].columns()[1]; + + let pages = IndexedPageReader::new(reader, c1, pages, vec![], vec![]); + let pages = BasicDecompressor::new(pages, vec![]); + + let arrays = column_iter_to_arrays( + vec![pages], + vec![&c1.descriptor().descriptor.primitive_type], + schema.fields[1].clone(), + row_group.num_rows() as usize, + )?; + + let arrays = arrays.collect::>>()?; + + assert_eq!(arrays, vec![expected]); + Ok(()) +} + +#[test] +fn indexed_required_utf8() -> Result<()> { + let array21 = Utf8Array::::from_slice(["a", "b", "c"]); + let array22 = Utf8Array::::from_slice(["d", "e", "f"]); + let expected = Arc::new(Utf8Array::::from_slice(["e"])) as Arc; + + read_with_indexes(pages(&[&array21, &array22], Encoding::Plain)?, expected) +} + +#[test] +fn indexed_required_i32() -> Result<()> { + let array21 = Int32Array::from_slice([1, 2, 3]); + let array22 = Int32Array::from_slice([4, 5, 6]); + let expected = Arc::new(Int32Array::from_slice([5])) as Arc; + + read_with_indexes(pages(&[&array21, &array22], Encoding::Plain)?, expected) +} + +#[test] +fn indexed_optional_i32() -> Result<()> { + let array21 = Int32Array::from([Some(1), Some(2), None]); + let array22 = Int32Array::from([None, Some(5), Some(6)]); + let expected = Arc::new(Int32Array::from_slice([5])) as Arc; + + read_with_indexes(pages(&[&array21, &array22], Encoding::Plain)?, expected) +} + +#[test] +fn indexed_optional_utf8() -> Result<()> { + let array21 = Utf8Array::::from([Some("a"), Some("b"), None]); + let array22 = Utf8Array::::from([None, Some("e"), Some("f")]); + let expected = Arc::new(Utf8Array::::from_slice(["e"])) as Arc; + + read_with_indexes(pages(&[&array21, &array22], Encoding::Plain)?, expected) +} + +#[test] +fn indexed_required_fixed_len() -> Result<()> { + let array21 = FixedSizeBinaryArray::from_slice([[127], [128], [129]]); + let array22 = FixedSizeBinaryArray::from_slice([[130], [131], [132]]); + let expected = Arc::new(FixedSizeBinaryArray::from_slice([[131]])) as Arc; + + read_with_indexes(pages(&[&array21, &array22], Encoding::Plain)?, expected) +} + +#[test] +fn indexed_optional_fixed_len() -> Result<()> { + let array21 = FixedSizeBinaryArray::from([Some([127]), Some([128]), None]); + let array22 = FixedSizeBinaryArray::from([None, Some([131]), Some([132])]); + let expected = Arc::new(FixedSizeBinaryArray::from_slice([[131]])) as Arc; + + read_with_indexes(pages(&[&array21, &array22], Encoding::Plain)?, expected) +} + +#[test] +fn indexed_required_boolean() -> Result<()> { + let array21 = BooleanArray::from_slice([true, false, true]); + let array22 = BooleanArray::from_slice([false, false, true]); + let expected = Arc::new(BooleanArray::from_slice([false])) as Arc; + + read_with_indexes(pages(&[&array21, &array22], Encoding::Plain)?, expected) +} + +#[test] +fn indexed_optional_boolean() -> Result<()> { + let array21 = BooleanArray::from([Some(true), Some(false), None]); + let array22 = BooleanArray::from([None, Some(false), Some(true)]); + let expected = Arc::new(BooleanArray::from_slice([false])) as Arc; + + read_with_indexes(pages(&[&array21, &array22], Encoding::Plain)?, expected) +} + +#[test] +fn indexed_dict() -> 
Result<()> { + let indices = PrimitiveArray::from_values((0..6u64).map(|x| x % 2)); + let values = PrimitiveArray::from_slice([4i32, 6i32]); + let array = DictionaryArray::from_data(indices, std::sync::Arc::new(values)); + + let indices = PrimitiveArray::from_slice(&[0u64]); + let values = PrimitiveArray::from_slice([4i32, 6i32]); + let expected = DictionaryArray::from_data(indices, std::sync::Arc::new(values)); + + let expected = Arc::new(expected) as Arc; + + read_with_indexes(pages(&[&array], Encoding::RleDictionary)?, expected) +} diff --git a/tests/it/io/parquet/write.rs b/tests/it/io/parquet/write.rs index c9141f4d515..06f3a706f02 100644 --- a/tests/it/io/parquet/write.rs +++ b/tests/it/io/parquet/write.rs @@ -49,12 +49,11 @@ fn round_trip( writer.start()?; for group in row_groups { - let (group, len) = group?; - writer.write(group, len)?; + writer.write(group?)?; } - let (_size, writer) = writer.end(None)?; + writer.end(None)?; - let data = writer.into_inner(); + let data = writer.into_inner().into_inner(); let (result, stats) = read_column(&mut Cursor::new(data), 0, "a1")?; assert_eq!(array.as_ref(), result.as_ref()); @@ -354,7 +353,7 @@ fn utf8_optional_v2_delta() -> Result<()> { #[test] fn i32_optional_v2_dict() -> Result<()> { round_trip( - "string_large", + "int32_dict", true, false, Version::V2, @@ -366,7 +365,7 @@ fn i32_optional_v2_dict() -> Result<()> { #[test] fn i32_optional_v2_dict_compressed() -> Result<()> { round_trip( - "string_large", + "int32_dict", true, false, Version::V2, diff --git a/tests/it/io/parquet/write_async.rs b/tests/it/io/parquet/write_async.rs index 5f9d09515e5..86ff5434df6 100644 --- a/tests/it/io/parquet/write_async.rs +++ b/tests/it/io/parquet/write_async.rs @@ -7,14 +7,10 @@ use arrow2::{ error::Result, io::parquet::{ read::{infer_schema, read_columns_many_async, read_metadata_async, RowGroupDeserializer}, - write::Encoding, + write::{Compression, Encoding, Version, WriteOptions}, }, }; use futures::{future::BoxFuture, io::Cursor, SinkExt}; -use parquet2::{ - compression::Compression, - write::{Version, WriteOptions}, -}; use super::FileSink;
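Taken together, the user-facing write path after this migration: `WriteOptions` is arrow2's own struct (now carrying `compression`), `FileWriter::write` no longer takes a row count, and `end` takes `&mut self`, with `into_inner` recovering the underlying writer. A minimal sketch under those assumptions (the schema, values, and the `write_example` name are illustrative, not from the patch):

use std::sync::Arc;

use arrow2::array::{Array, Int32Array};
use arrow2::chunk::Chunk;
use arrow2::datatypes::{DataType, Field, Schema};
use arrow2::error::Result;
use arrow2::io::parquet::write::{
    Compression, Encoding, FileWriter, RowGroupIterator, Version, WriteOptions,
};

fn write_example() -> Result<Vec<u8>> {
    let schema = Schema::from(vec![Field::new("c1", DataType::Int32, false)]);
    let chunk = Chunk::new(vec![Arc::new(Int32Array::from_slice([1, 2, 3])) as Arc<dyn Array>]);

    // WriteOptions now lives in arrow2 and includes the compression.
    let options = WriteOptions {
        write_statistics: true,
        compression: Compression::Uncompressed,
        version: Version::V2,
    };

    let row_groups = RowGroupIterator::try_new(
        vec![Ok(chunk)].into_iter(),
        &schema,
        options,
        vec![Encoding::Plain],
    )?;

    let mut writer = FileWriter::try_new(vec![], schema, options)?;
    writer.start()?;
    for group in row_groups {
        // `write` no longer receives the number of rows; the pages carry it.
        writer.write(group?)?;
    }
    // `end` borrows the writer; the inner `Vec<u8>` is recovered afterwards.
    writer.end(None)?;
    Ok(writer.into_inner())
}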