diff --git a/src/io/parquet/read/deserialize/mod.rs b/src/io/parquet/read/deserialize/mod.rs index d3baa7879be..9f441872551 100644 --- a/src/io/parquet/read/deserialize/mod.rs +++ b/src/io/parquet/read/deserialize/mod.rs @@ -21,7 +21,7 @@ use crate::{ offset::Offsets, }; -use self::nested_utils::{InitNested, NestedArrayIter, NestedState}; +pub use self::nested_utils::{init_nested, InitNested, NestedArrayIter, NestedState}; use simple::page_iter_to_arrays; use super::*; @@ -43,7 +43,8 @@ pub fn get_page_iterator( )?) } -fn create_list( +/// Creates a new [`ListArray`] or [`FixedSizeListArray`]. +pub fn create_list( data_type: DataType, nested: &mut NestedState, values: Box, @@ -128,7 +129,7 @@ where } /// Returns the number of (parquet) columns that a [`DataType`] contains. -fn n_columns(data_type: &DataType) -> usize { +pub fn n_columns(data_type: &DataType) -> usize { use crate::datatypes::PhysicalType::*; match data_type.to_physical_type() { Null | Boolean | Primitive(_) | Binary | FixedSizeBinary | LargeBinary | Utf8 diff --git a/src/io/parquet/read/deserialize/nested_utils.rs b/src/io/parquet/read/deserialize/nested_utils.rs index f6f1a8baaa3..fef27bb0395 100644 --- a/src/io/parquet/read/deserialize/nested_utils.rs +++ b/src/io/parquet/read/deserialize/nested_utils.rs @@ -264,14 +264,19 @@ pub(super) trait NestedDecoder<'a> { fn deserialize_dict(&self, page: &DictPage) -> Self::Dictionary; } +/// The initial info of nested data types. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum InitNested { + /// Primitive data types Primitive(bool), + /// List data types List(bool), + /// Struct data types Struct(bool), } -fn init_nested(init: &[InitNested], capacity: usize) -> NestedState { +/// Initialize [`NestedState`] from `&[InitNested]`. +pub fn init_nested(init: &[InitNested], capacity: usize) -> NestedState { let container = init .iter() .map(|init| match init { @@ -324,12 +329,15 @@ impl<'a> NestedPage<'a> { } } +/// The state of nested data types. #[derive(Debug)] pub struct NestedState { + /// The nesteds composing `NestedState`. pub nested: Vec>, } impl NestedState { + /// Creates a new [`NestedState`]. pub fn new(nested: Vec>) -> Self { Self { nested } } diff --git a/src/io/parquet/read/mod.rs b/src/io/parquet/read/mod.rs index b8031a37580..58454978d17 100644 --- a/src/io/parquet/read/mod.rs +++ b/src/io/parquet/read/mod.rs @@ -34,7 +34,10 @@ pub use parquet2::{ use crate::{array::Array, error::Result}; -pub use deserialize::{column_iter_to_arrays, get_page_iterator}; +pub use deserialize::{ + column_iter_to_arrays, create_list, get_page_iterator, init_nested, n_columns, InitNested, + NestedState, +}; pub use file::{FileReader, RowGroupReader}; pub use row_group::*; pub use schema::{infer_schema, FileMetaData}; diff --git a/src/io/parquet/write/mod.rs b/src/io/parquet/write/mod.rs index 5bb0bd0f619..3fc13f74031 100644 --- a/src/io/parquet/write/mod.rs +++ b/src/io/parquet/write/mod.rs @@ -32,6 +32,8 @@ use crate::error::{Error, Result}; use crate::types::days_ms; use crate::types::NativeType; +pub use nested::write_rep_and_def; +pub use pages::{to_leaves, to_nested, to_parquet_leaves}; use parquet2::schema::types::PrimitiveType as ParquetPrimitiveType; pub use parquet2::{ compression::{BrotliLevel, CompressionOptions, GzipLevel, ZstdLevel}, @@ -46,6 +48,7 @@ pub use parquet2::{ }, FallibleStreamingIterator, }; +pub use utils::write_def_levels; /// Currently supported options to write to parquet #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -70,7 +73,7 @@ pub use pages::array_to_columns; pub use pages::Nested; /// returns offset and length to slice the leaf values -pub(self) fn slice_nested_leaf(nested: &[Nested]) -> (usize, usize) { +pub fn slice_nested_leaf(nested: &[Nested]) -> (usize, usize) { // find the deepest recursive dremel structure as that one determines how many values we must // take let mut out = (0, 0); @@ -154,7 +157,8 @@ pub fn can_encode(data_type: &DataType, encoding: Encoding) -> bool { ) } -fn slice_parquet_array<'a>( +/// Slices the [`Array`] to `Box` and `Vec`. +pub fn slice_parquet_array<'a>( array: &'a dyn Array, nested: &'a [Nested<'a>], offset: usize, @@ -186,9 +190,9 @@ fn slice_parquet_array<'a>( } } -fn get_max_length(array: &dyn Array, nested: &[Nested]) -> usize { - // get the length that should be sliced. - // that is the inner nested structure that +/// Get the length of [`Array`] that should be sliced. +pub fn get_max_length(array: &dyn Array, nested: &[Nested]) -> usize { + // the inner nested structure that // dictates how often the primitive should be repeated for nested in nested.iter().rev() { match nested { diff --git a/src/io/parquet/write/nested/mod.rs b/src/io/parquet/write/nested/mod.rs index bd6747a1530..3629f0a6220 100644 --- a/src/io/parquet/write/nested/mod.rs +++ b/src/io/parquet/write/nested/mod.rs @@ -106,6 +106,7 @@ fn to_length( .map(|w| w[1].to_usize() - w[0].to_usize()) } +/// Write `repetition_levels` and `definition_levels` to buffer. pub fn write_rep_and_def( page_version: Version, nested: &[Nested], diff --git a/src/io/parquet/write/pages.rs b/src/io/parquet/write/pages.rs index 1e4bcbe9071..e7ddf8991c4 100644 --- a/src/io/parquet/write/pages.rs +++ b/src/io/parquet/write/pages.rs @@ -149,7 +149,8 @@ fn to_nested_recursive<'a>( Ok(()) } -fn to_leaves(array: &dyn Array) -> Vec<&dyn Array> { +/// Convert [`Array`] to `Vec<&dyn Array>` leaves in DFS order. +pub fn to_leaves(array: &dyn Array) -> Vec<&dyn Array> { let mut leaves = vec![]; to_leaves_recursive(array, &mut leaves); leaves @@ -179,7 +180,8 @@ fn to_leaves_recursive<'a>(array: &'a dyn Array, leaves: &mut Vec<&'a dyn Array> } } -fn to_parquet_leaves(type_: ParquetType) -> Vec { +/// Convert `ParquetType` to `Vec` leaves in DFS order. +pub fn to_parquet_leaves(type_: ParquetType) -> Vec { let mut leaves = vec![]; to_parquet_leaves_recursive(type_, &mut leaves); leaves