From 80b2be513362f51a6bf7e96095670af222d48e67 Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao"
Date: Fri, 18 Mar 2022 04:54:55 +0000
Subject: [PATCH 01/16] Improved indexes

---
 examples/read_metadata.rs |   2 +-
 src/error.rs              |   6 ++
 src/indexes/index.rs      | 192 ++++++++++++++++++++++++++++++++++++++
 src/indexes/intervals.rs  |  56 +++++++++++
 src/indexes/mod.rs        | 137 +++++++++++++++------------
 src/indexes/read.rs       |  83 ++++++++++++++++
 src/parquet_bridge.rs     |  43 +++++++++
 src/types.rs              |   2 +-
 8 files changed, 461 insertions(+), 60 deletions(-)
 create mode 100644 src/indexes/index.rs
 create mode 100644 src/indexes/intervals.rs
 create mode 100644 src/indexes/read.rs

diff --git a/examples/read_metadata.rs b/examples/read_metadata.rs
index c7b5d2729..ed5246d4c 100644
--- a/examples/read_metadata.rs
+++ b/examples/read_metadata.rs
@@ -57,7 +57,7 @@ fn main() -> Result<()> {
 
     // ANCHOR: column_index
     // read the column index
-    let index = indexes::read_column(&mut reader, column_metadata.column_chunk())?;
+    let index = indexes::read_column(&mut reader, column_metadata)?;
     if let Some(index) = index {
         // these are the minimum and maximum within each page, which can be used
         // to skip pages.
diff --git a/src/error.rs b/src/error.rs
index fa0038401..18ebdfe7e 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -76,6 +76,12 @@
     }
 }
 
+impl From<std::array::TryFromSliceError> for ParquetError {
+    fn from(e: std::array::TryFromSliceError) -> ParquetError {
+        ParquetError::OutOfSpec(format!("Can't deserialize to parquet native type: {}", e))
+    }
+}
+
 /// A specialized `Result` for Parquet errors.
 pub type Result<T> = std::result::Result<T, ParquetError>;
 
diff --git a/src/indexes/index.rs b/src/indexes/index.rs
new file mode 100644
index 000000000..38fb89d77
--- /dev/null
+++ b/src/indexes/index.rs
@@ -0,0 +1,192 @@
+use std::any::Any;
+
+use parquet_format_async_temp::ColumnIndex;
+
+use crate::parquet_bridge::BoundaryOrder;
+use crate::{error::ParquetError, schema::types::PhysicalType, types::NativeType};
+
+pub trait Index: Send + Sync + std::fmt::Debug {
+    fn as_any(&self) -> &dyn Any;
+
+    fn physical_type(&self) -> &PhysicalType;
+}
+
+/// An index of a column of [`NativeType`] physical representation
+#[derive(Debug, Clone, PartialEq, Hash)]
+pub struct NativeIndex<T: NativeType> {
+    pub indexes: Vec<PageIndex<T>>,
+    pub boundary_order: BoundaryOrder,
+}
+
+/// The index of a page, containing the min and max values of the page.
+#[derive(Debug, Clone, PartialEq, Hash)]
+pub struct PageIndex<T> {
+    /// The minimum value in the page. It is None when all values are null
+    pub min: Option<T>,
+    /// The maximum value in the page. It is None when all values are null
+    pub max: Option<T>,
+    /// The number of null values in the page
+    pub null_count: Option<i64>,
+}
+
+impl<T: NativeType> TryFrom<ColumnIndex> for NativeIndex<T> {
+    type Error = ParquetError;
+
+    fn try_from(index: ColumnIndex) -> Result<Self, Self::Error> {
+        let len = index.min_values.len();
+
+        let null_counts = index
+            .null_counts
+            .map(|x| x.into_iter().map(Some).collect::<Vec<_>>())
+            .unwrap_or_else(|| vec![None; len]);
+
+        let indexes = index
+            .min_values
+            .iter()
+            .zip(index.max_values.into_iter())
+            .zip(index.null_pages.into_iter())
+            .zip(null_counts.into_iter())
+            .map(|(((min, max), is_null), null_count)| {
+                let (min, max) = if is_null {
+                    (None, None)
+                } else {
+                    let min = min.as_slice().try_into()?;
+                    let max = max.as_slice().try_into()?;
+                    (Some(T::from_le_bytes(min)), Some(T::from_le_bytes(max)))
+                };
+                Ok(PageIndex {
+                    min,
+                    max,
+                    null_count,
+                })
+            })
+            .collect::<Result<Vec<_>, ParquetError>>()?;
+
+        Ok(Self {
+            indexes,
+            boundary_order: index.boundary_order.try_into()?,
+        })
+    }
+}
+
+impl<T: NativeType> Index for NativeIndex<T> {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn physical_type(&self) -> &PhysicalType {
+        &T::TYPE
+    }
+}
+
+/// An index of a column of bytes physical type
+#[derive(Debug, Clone, PartialEq, Hash)]
+pub struct ByteIndex {
+    pub indexes: Vec<PageIndex<Vec<u8>>>,
+    pub boundary_order: BoundaryOrder,
+}
+
+impl TryFrom<ColumnIndex> for ByteIndex {
+    type Error = ParquetError;
+
+    fn try_from(index: ColumnIndex) -> Result<Self, Self::Error> {
+        let len = index.min_values.len();
+
+        let null_counts = index
+            .null_counts
+            .map(|x| x.into_iter().map(Some).collect::<Vec<_>>())
+            .unwrap_or_else(|| vec![None; len]);
+
+        let indexes = index
+            .min_values
+            .into_iter()
+            .zip(index.max_values.into_iter())
+            .zip(index.null_pages.into_iter())
+            .zip(null_counts.into_iter())
+            .map(|(((min, max), is_null), null_count)| {
+                let (min, max) = if is_null {
+                    (None, None)
+                } else {
+                    (Some(min), Some(max))
+                };
+                Ok(PageIndex {
+                    min,
+                    max,
+                    null_count,
+                })
+            })
+            .collect::<Result<Vec<_>, ParquetError>>()?;
+
+        Ok(Self {
+            indexes,
+            boundary_order: index.boundary_order.try_into()?,
+        })
+    }
+}
+
+impl Index for ByteIndex {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn physical_type(&self) -> &PhysicalType {
+        &PhysicalType::ByteArray
+    }
+}
+
+/// An index of a column of fixed len byte physical type
+#[derive(Debug, Clone, PartialEq, Hash)]
+pub struct FixedLenByteIndex {
+    pub type_: PhysicalType,
+    pub indexes: Vec<PageIndex<Vec<u8>>>,
+    pub boundary_order: BoundaryOrder,
+}
+
+impl TryFrom<(ColumnIndex, i32)> for FixedLenByteIndex {
+    type Error = ParquetError;
+
+    fn try_from((index, size): (ColumnIndex, i32)) -> Result<Self, Self::Error> {
+        let len = index.min_values.len();
+
+        let null_counts = index
+            .null_counts
+            .map(|x| x.into_iter().map(Some).collect::<Vec<_>>())
+            .unwrap_or_else(|| vec![None; len]);
+
+        let indexes = index
+            .min_values
+            .into_iter()
+            .zip(index.max_values.into_iter())
+            .zip(index.null_pages.into_iter())
+            .zip(null_counts.into_iter())
+            .map(|(((min, max), is_null), null_count)| {
+                let (min, max) = if is_null {
+                    (None, None)
+                } else {
+                    (Some(min), Some(max))
+                };
+                Ok(PageIndex {
+                    min,
+                    max,
+                    null_count,
+                })
+            })
+            .collect::<Result<Vec<_>, ParquetError>>()?;
+
+        Ok(Self {
+            type_: PhysicalType::FixedLenByteArray(size),
+            indexes,
+            boundary_order: index.boundary_order.try_into()?,
+        })
+    }
+}
+
+impl Index for FixedLenByteIndex {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn physical_type(&self) -> &PhysicalType {
+        &self.type_
+    }
+}
diff --git a/src/indexes/intervals.rs b/src/indexes/intervals.rs
new file mode 100644
index 000000000..4550a9b63
--- /dev/null
+++ b/src/indexes/intervals.rs
@@ -0,0 +1,56 @@
+use parquet_format_async_temp::PageLocation;
+
+use crate::error::ParquetError;
+
+use super::index::PageIndex;
+
+/// Returns the set of (row) intervals of the pages.
+fn compute_row_page_intervals(
+    locations: &[PageLocation],
+    num_rows: u64,
+) -> Result<Vec<(u64, u64)>, ParquetError> {
+    if locations.is_empty() {
+        return Ok(vec![]);
+    };
+
+    let last = (|| {
+        let first = locations.last().unwrap().first_row_index;
+        let start = u64::try_from(first)?;
+        let length = num_rows - start;
+        Result::<_, ParquetError>::Ok((start, length))
+    })();
+
+    let pages_lengths = locations
+        .windows(2)
+        .map(|x| {
+            let start = u64::try_from(x[0].first_row_index)?;
+            let length = u64::try_from(x[1].first_row_index - x[0].first_row_index)?;
+            Ok((start, length))
+        })
+        .chain(std::iter::once(last));
+    pages_lengths.collect()
+}
+
+/// Returns the set of intervals `(start, len)` containing all the
+/// selected rows (for a given column)
+pub fn compute_rows<'a, T>(
+    index: &'a [PageIndex<T>],
+    locations: &[PageLocation],
+    num_rows: u64,
+    selector: &dyn Fn(&'a PageIndex<T>) -> bool,
+) -> Result<Vec<(u64, u64)>, ParquetError> {
+    let page_intervals = compute_row_page_intervals(locations, num_rows)?;
+
+    Ok(index
+        .iter()
+        .zip(page_intervals.iter().copied())
+        .filter_map(|(index, page)| {
+            let is_selected = selector(index);
+            if is_selected {
+                Some(page)
+            } else {
+                None
+            }
+        })
+        .collect())
+}
diff --git a/src/indexes/mod.rs b/src/indexes/mod.rs
index 6934a6f5b..1dd20f2f9 100644
--- a/src/indexes/mod.rs
+++ b/src/indexes/mod.rs
@@ -1,60 +1,81 @@
-use std::convert::TryInto;
-use std::io::{Cursor, Read, Seek, SeekFrom};
-
-use parquet_format_async_temp::{
-    thrift::protocol::TCompactInputProtocol, ColumnChunk, ColumnIndex, OffsetIndex, PageLocation,
-};
-
-use crate::error::ParquetError;
-
-/// Read the [`ColumnIndex`] from the [`ColumnChunk`], if available.
-pub fn read_column<R: Read + Seek>(
-    reader: &mut R,
-    chunk: &ColumnChunk,
-) -> Result<Option<ColumnIndex>, ParquetError> {
-    let (offset, length): (u64, usize) = if let Some(offset) = chunk.column_index_offset {
-        let length = chunk.column_index_length.ok_or_else(|| {
-            ParquetError::OutOfSpec(
-                "The column length must exist if column offset exists".to_string(),
-            )
-        })?;
-        (offset.try_into()?, length.try_into()?)
-    } else {
-        return Ok(None);
-    };
-
-    reader.seek(SeekFrom::Start(offset))?;
-    let mut data = vec![0; length];
-    reader.read_exact(&mut data)?;
-
-    let mut d = Cursor::new(&data);
-    let mut prot = TCompactInputProtocol::new(&mut d);
-    Ok(Some(ColumnIndex::read_from_in_protocol(&mut prot)?))
-}
+mod index;
+mod intervals;
+mod read;
+
+pub use self::index::{ByteIndex, FixedLenByteIndex, Index, NativeIndex, PageIndex};
+pub use intervals::compute_rows;
+pub use read::*;
+
+#[cfg(test)]
+mod tests {
+    use parquet_format_async_temp::PageLocation;
+
+    use super::*;
+
+    #[test]
+    fn test_basic() {
+        let index = NativeIndex {
+            indexes: vec![PageIndex {
+                min: Some(0i32),
+                max: Some(10),
+                null_count: Some(0),
+            }],
+            boundary_order: Default::default(),
+        };
+        let locations = &[PageLocation {
+            offset: 100,
+            compressed_page_size: 10,
+            first_row_index: 0,
+        }];
+        let num_rows = 10;
+
+        let selector = |_| true;
+
+        let row_intervals = compute_rows(&index.indexes, locations, num_rows, &selector).unwrap();
+        assert_eq!(row_intervals, vec![(0, 10)])
+    }
+
+    #[test]
+    fn test_multiple() {
+        // two pages
+        let index = ByteIndex {
+            indexes: vec![
+                PageIndex {
+                    min: Some(vec![0]),
+                    max: Some(vec![8, 9]),
+                    null_count: Some(0),
+                },
+                PageIndex {
+                    min: Some(vec![20]),
+                    max: Some(vec![98, 99]),
+                    null_count: Some(0),
+                },
+            ],
+            boundary_order: Default::default(),
+        };
+        let locations = &[
+            PageLocation {
+                offset: 100,
+                compressed_page_size: 10,
+                first_row_index: 0,
+            },
+            PageLocation {
+                offset: 110,
+                compressed_page_size: 20,
+                first_row_index: 5,
+            },
+        ];
+        let num_rows = 10;
+
+        // filter of the form `x > "a"`
+        let selector = |page: &PageIndex<Vec<u8>>| {
+            page.max
+                .as_ref()
+                .map(|x| x.as_slice() > &[97])
+                .unwrap_or(false) // no max is present => all nulls => not selected
+        };
 
-/// Read [`PageLocation`]s from the [`ColumnChunk`], if available.
-pub fn read_page_locations<R: Read + Seek>(
-    reader: &mut R,
-    chunk: &ColumnChunk,
-) -> Result<Option<Vec<PageLocation>>, ParquetError> {
-    let (offset, length): (u64, usize) = if let Some(offset) = chunk.offset_index_offset {
-        let length = chunk.offset_index_length.ok_or_else(|| {
-            ParquetError::OutOfSpec(
-                "The column length must exist if column offset exists".to_string(),
-            )
-        })?;
-        (offset.try_into()?, length.try_into()?)
-    } else {
-        return Ok(None);
-    };
-
-    reader.seek(SeekFrom::Start(offset))?;
-    let mut data = vec![0; length];
-    reader.read_exact(&mut data)?;
-
-    let mut d = Cursor::new(&data);
-    let mut prot = TCompactInputProtocol::new(&mut d);
-    let offset = OffsetIndex::read_from_in_protocol(&mut prot)?;
-
-    Ok(Some(offset.page_locations))
-}
+        let row_intervals = compute_rows(&index.indexes, locations, num_rows, &selector).unwrap();
+        assert_eq!(row_intervals, vec![(5, 5)])
+    }
+}
diff --git a/src/indexes/read.rs b/src/indexes/read.rs
new file mode 100644
index 000000000..6587edd9d
--- /dev/null
+++ b/src/indexes/read.rs
@@ -0,0 +1,83 @@
+use std::convert::TryInto;
+use std::io::{Cursor, Read, Seek, SeekFrom};
+
+use parquet_format_async_temp::{
+    thrift::protocol::TCompactInputProtocol, ColumnChunk, ColumnIndex, OffsetIndex, PageLocation,
+};
+
+use crate::error::ParquetError;
+use crate::metadata::ColumnChunkMetaData;
+use crate::schema::types::{ParquetType, PhysicalType};
+
+use super::{ByteIndex, FixedLenByteIndex, Index, NativeIndex};
+
+/// Read the [`ColumnIndex`] from the [`ColumnChunk`], if available.
+pub fn read_column<R: Read + Seek>(
+    reader: &mut R,
+    chunk: &ColumnChunkMetaData,
+) -> Result<Option<Box<dyn Index>>, ParquetError> {
+    let metadata = chunk.column_chunk();
+    let (offset, length): (u64, usize) = if let Some(offset) = metadata.column_index_offset {
+        let length = metadata.column_index_length.ok_or_else(|| {
+            ParquetError::OutOfSpec(
+                "The column length must exist if column offset exists".to_string(),
+            )
+        })?;
+        (offset.try_into()?, length.try_into()?)
+    } else {
+        return Ok(None);
+    };
+
+    reader.seek(SeekFrom::Start(offset))?;
+    let mut data = vec![0; length];
+    reader.read_exact(&mut data)?;
+
+    let mut d = Cursor::new(&data);
+    let mut prot = TCompactInputProtocol::new(&mut d);
+
+    let index = ColumnIndex::read_from_in_protocol(&mut prot)?;
+    let index = match chunk.descriptor().type_() {
+        ParquetType::PrimitiveType { physical_type, .. } => match physical_type {
+            PhysicalType::Boolean => return Ok(None),
+            PhysicalType::Int32 => Box::new(NativeIndex::<i32>::try_from(index)?) as Box<dyn Index>,
+            PhysicalType::Int64 => Box::new(NativeIndex::<i64>::try_from(index)?) as _,
+            PhysicalType::Int96 => Box::new(NativeIndex::<[u32; 3]>::try_from(index)?) as _,
+            PhysicalType::Float => Box::new(NativeIndex::<f32>::try_from(index)?),
+            PhysicalType::Double => Box::new(NativeIndex::<f64>::try_from(index)?),
+            PhysicalType::ByteArray => Box::new(ByteIndex::try_from(index)?),
+            PhysicalType::FixedLenByteArray(size) => {
+                Box::new(FixedLenByteIndex::try_from((index, *size))?)
+            }
+        },
+        _ => unreachable!(),
+    };
+
+    Ok(Some(index))
+}
+
+/// Read [`PageLocation`]s from the [`ColumnChunk`], if available.
+pub fn read_page_locations<R: Read + Seek>(
+    reader: &mut R,
+    chunk: &ColumnChunk,
+) -> Result<Option<Vec<PageLocation>>, ParquetError> {
+    let (offset, length): (u64, usize) = if let Some(offset) = chunk.offset_index_offset {
+        let length = chunk.offset_index_length.ok_or_else(|| {
+            ParquetError::OutOfSpec(
+                "The column length must exist if column offset exists".to_string(),
+            )
+        })?;
+        (offset.try_into()?, length.try_into()?)
+    } else {
+        return Ok(None);
+    };
+
+    reader.seek(SeekFrom::Start(offset))?;
+    let mut data = vec![0; length];
+    reader.read_exact(&mut data)?;
+
+    let mut d = Cursor::new(&data);
+    let mut prot = TCompactInputProtocol::new(&mut d);
+    let offset = OffsetIndex::read_from_in_protocol(&mut prot)?;
+
+    Ok(Some(offset.page_locations))
+}
diff --git a/src/parquet_bridge.rs b/src/parquet_bridge.rs
index 00077ae9e..ba4f23c65 100644
--- a/src/parquet_bridge.rs
+++ b/src/parquet_bridge.rs
@@ -3,6 +3,7 @@ use std::convert::TryFrom;
 use std::convert::TryInto;
 
 use crate::error::ParquetError;
+use parquet_format_async_temp::BoundaryOrder as ParquetBoundaryOrder;
 use parquet_format_async_temp::CompressionCodec;
 use parquet_format_async_temp::DataPageHeader;
 use parquet_format_async_temp::DataPageHeaderV2;
@@ -192,6 +193,48 @@ impl From<Encoding> for ParquetEncoding {
     }
 }
 
+/// Enum to annotate whether lists of min/max elements inside ColumnIndex
+/// are ordered and if so, in which direction.
+#[derive(Debug, Eq, PartialEq, Hash, Clone, Copy)]
+pub enum BoundaryOrder {
+    Unordered,
+    Ascending,
+    Descending,
+}
+
+impl Default for BoundaryOrder {
+    fn default() -> Self {
+        Self::Unordered
+    }
+}
+
+impl TryFrom<ParquetBoundaryOrder> for BoundaryOrder {
+    type Error = ParquetError;
+
+    fn try_from(encoding: ParquetBoundaryOrder) -> Result<Self, Self::Error> {
+        Ok(match encoding {
+            ParquetBoundaryOrder::UNORDERED => BoundaryOrder::Unordered,
+            ParquetBoundaryOrder::ASCENDING => BoundaryOrder::Ascending,
+            ParquetBoundaryOrder::DESCENDING => BoundaryOrder::Descending,
+            _ => {
+                return Err(ParquetError::OutOfSpec(
+                    "BoundaryOrder Thrift value out of range".to_string(),
+                ))
+            }
+        })
+    }
+}
+
+impl From<BoundaryOrder> for ParquetBoundaryOrder {
+    fn from(encoding: BoundaryOrder) -> Self {
+        match encoding {
+            BoundaryOrder::Unordered => ParquetBoundaryOrder::UNORDERED,
+            BoundaryOrder::Ascending => ParquetBoundaryOrder::ASCENDING,
+            BoundaryOrder::Descending => ParquetBoundaryOrder::DESCENDING,
+        }
+    }
+}
+
 pub trait DataPageHeaderExt {
     fn encoding(&self) -> Encoding;
     fn repetition_level_encoding(&self) -> Encoding;
diff --git a/src/types.rs b/src/types.rs
index 04171c63b..db01a6d8d 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -4,7 +4,7 @@ use crate::schema::types::PhysicalType;
 
 /// A physical native representation of a Parquet fixed-sized type.
 pub trait NativeType: Sized + Copy + std::fmt::Debug + Send + Sync + 'static {
-    type Bytes: AsRef<[u8]> + for<'a> TryFrom<&'a [u8]>;
+    type Bytes: AsRef<[u8]> + for<'a> TryFrom<&'a [u8], Error = std::array::TryFromSliceError>;
 
     fn to_le_bytes(&self) -> Self::Bytes;
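The `Box<dyn Index>` returned by `read_column` is type-erased on purpose: callers recover the concrete index through `as_any` once they know the column's physical type. A minimal sketch of that usage, assuming a file whose first column is INT32 — the function name and the file handling are illustrative, not part of this patch:

    use parquet2::error::Result;
    use parquet2::indexes::{self, NativeIndex};
    use parquet2::read::read_metadata;

    fn print_i32_page_bounds(mut reader: std::fs::File) -> Result<()> {
        let metadata = read_metadata(&mut reader)?;
        // first column of the first row group, assumed here to be INT32
        let column_metadata = &metadata.row_groups[0].columns()[0];
        if let Some(index) = indexes::read_column(&mut reader, column_metadata)? {
            // `as_any` recovers the concrete type from the `dyn Index`
            if let Some(index) = index.as_any().downcast_ref::<NativeIndex<i32>>() {
                for page in &index.indexes {
                    println!("min: {:?}, max: {:?}, nulls: {:?}", page.min, page.max, page.null_count);
                }
            }
        }
        Ok(())
    }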
From 2a170d17412747705fa0cd1bdc62adebe36aac68 Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao"
Date: Sat, 19 Mar 2022 05:24:03 +0000
Subject: [PATCH 02/16] Added logic to read filtered pages

---
 src/indexes/intervals.rs |  89 ++++++++++++++++++++-
 src/indexes/mod.rs       | 164 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 245 insertions(+), 8 deletions(-)

diff --git a/src/indexes/intervals.rs b/src/indexes/intervals.rs
index 4550a9b63..5a3d0d269 100644
--- a/src/indexes/intervals.rs
+++ b/src/indexes/intervals.rs
@@ -4,11 +4,27 @@ use crate::error::ParquetError;
 
 use super::index::PageIndex;
 
+/// An interval
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Interval {
+    /// Its start
+    pub start: u64,
+    /// Its length
+    pub length: u64,
+}
+
+impl Interval {
+    /// Create a new interval
+    pub fn new(start: u64, length: u64) -> Self {
+        Self { start, length }
+    }
+}
+
 /// Returns the set of (row) intervals of the pages.
 fn compute_row_page_intervals(
     locations: &[PageLocation],
     num_rows: u64,
-) -> Result<Vec<(u64, u64)>, ParquetError> {
+) -> Result<Vec<Interval>, ParquetError> {
     if locations.is_empty() {
         return Ok(vec![]);
     };
@@ -17,7 +33,7 @@ fn compute_row_page_intervals(
         let first = locations.last().unwrap().first_row_index;
         let start = u64::try_from(first)?;
         let length = num_rows - start;
-        Result::<_, ParquetError>::Ok((start, length))
+        Result::<_, ParquetError>::Ok(Interval::new(start, length))
     })();
 
     let pages_lengths = locations
@@ -25,7 +41,7 @@ fn compute_row_page_intervals(
         .map(|x| {
             let start = u64::try_from(x[0].first_row_index)?;
             let length = u64::try_from(x[1].first_row_index - x[0].first_row_index)?;
-            Ok((start, length))
+            Ok(Interval::new(start, length))
         })
         .chain(std::iter::once(last));
     pages_lengths.collect()
@@ -38,7 +54,7 @@ pub fn compute_rows<'a, T>(
     locations: &[PageLocation],
     num_rows: u64,
     selector: &dyn Fn(&'a PageIndex<T>) -> bool,
-) -> Result<Vec<(u64, u64)>, ParquetError> {
+) -> Result<Vec<Interval>, ParquetError> {
     let page_intervals = compute_row_page_intervals(locations, num_rows)?;
 
     Ok(index
@@ -54,3 +70,68 @@ pub fn compute_rows<'a, T>(
         })
         .collect())
 }
+
+/// An enum describing a page that was either selected in a filter pushdown or skipped
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum FilteredPage {
+    Select {
+        /// Location of the page in the file
+        start: u64,
+        length: u64,
+        /// Location of rows to select in the page
+        rows_offset: u64,
+        rows_length: u64,
+    },
+    Skip {
+        /// Location of the page in the file
+        start: u64,
+        length: u64,
+        /// number of rows that are skipped by skipping this page
+        num_rows: usize,
+    },
+}
+
+fn is_in(probe: Interval, intervals: &[Interval]) -> Option<Interval> {
+    intervals.iter().find_map(|interval| {
+        let interval_end = interval.start + interval.length;
+        let probe_end = probe.start + probe.length;
+        let overlaps = (probe.start < interval_end) && (probe_end > interval.start);
+        if overlaps {
+            let start = interval.start.max(probe.start);
+            let end = interval_end.min(probe_end);
+            Some(Interval::new(start - probe.start, end - start))
+        } else {
+            None
+        }
+    })
+}
+
+/// Given a set of selected [Interval]s of rows and the set of page locations, returns the
+/// [`FilteredPage`]s that cover them.
+pub fn select_pages(
+    intervals: &[Interval],
+    locations: &[PageLocation],
+    num_rows: u64,
+) -> Result<Vec<FilteredPage>, ParquetError> {
+    let page_intervals = compute_row_page_intervals(locations, num_rows)?;
+
+    page_intervals
+        .into_iter()
+        .zip(locations.iter())
+        .map(|(interval, location)| {
+            Ok(if let Some(overlap) = is_in(interval, intervals) {
+                FilteredPage::Select {
+                    start: location.offset.try_into()?,
+                    length: location.compressed_page_size.try_into()?,
+                    rows_offset: overlap.start,
+                    rows_length: overlap.length,
+                }
+            } else {
+                FilteredPage::Skip {
+                    start: location.offset.try_into()?,
+                    length: location.compressed_page_size.try_into()?,
+                    num_rows: interval.length.try_into()?,
+                }
+            })
+        })
+        .collect()
+}
diff --git a/src/indexes/mod.rs b/src/indexes/mod.rs
index 1dd20f2f9..585e630d2 100644
--- a/src/indexes/mod.rs
+++ b/src/indexes/mod.rs
@@ -3,7 +3,7 @@ mod intervals;
 mod read;
 
 pub use self::index::{ByteIndex, FixedLenByteIndex, Index, NativeIndex, PageIndex};
-pub use intervals::compute_rows;
+pub use intervals::{compute_rows, select_pages, FilteredPage, Interval};
 pub use read::*;
 
 #[cfg(test)]
@@ -32,7 +32,7 @@ mod tests {
         let selector = |_| true;
 
         let row_intervals = compute_rows(&index.indexes, locations, num_rows, &selector).unwrap();
-        assert_eq!(row_intervals, vec![(0, 10)])
+        assert_eq!(row_intervals, vec![Interval::new(0, 10)])
     }
 
     #[test]
@@ -75,7 +75,163 @@ mod tests {
                 .unwrap_or(false) // no max is present => all nulls => not selected
         };
 
-        let row_intervals = compute_rows(&index.indexes, locations, num_rows, &selector).unwrap();
-        assert_eq!(row_intervals, vec![(5, 5)])
+        let rows = compute_rows(&index.indexes, locations, num_rows, &selector).unwrap();
+        assert_eq!(rows, vec![Interval::new(5, 5)]);
+
+        let pages = select_pages(&rows, locations, num_rows).unwrap();
+
+        assert_eq!(
+            pages,
+            vec![
+                FilteredPage::Skip {
+                    start: 100,
+                    length: 10,
+                    num_rows: 5
+                },
+                FilteredPage::Select {
+                    start: 110,
+                    length: 20,
+                    rows_offset: 0,
+                    rows_length: 5
+                }
+            ]
+        );
+    }
+
+    #[test]
+    fn test_other_column() {
+        let locations = &[
+            PageLocation {
+                offset: 100,
+                compressed_page_size: 20,
+                first_row_index: 0,
+            },
+            PageLocation {
+                offset: 120,
+                compressed_page_size: 20,
+                first_row_index: 10,
+            },
+        ];
+        let num_rows = 100;
+
+        let intervals = &[Interval::new(5, 5)];
+
+        let pages = select_pages(intervals, locations, num_rows).unwrap();
+
+        assert_eq!(
+            pages,
+            vec![
+                FilteredPage::Select {
+                    start: 100,
+                    length: 20,
+                    rows_offset: 5,
+                    rows_length: 5
+                },
+                FilteredPage::Skip {
+                    start: 120,
+                    length: 20,
+                    num_rows: 90
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn test_other_interval_in_middle() {
+        let locations = &[
+            PageLocation {
+                offset: 100,
+                compressed_page_size: 20,
+                first_row_index: 0,
+            },
+            PageLocation {
+                offset: 120,
+                compressed_page_size: 20,
+                first_row_index: 10,
+            },
+            PageLocation {
+                offset: 140,
+                compressed_page_size: 20,
+                first_row_index: 100,
+            },
+        ];
+        let num_rows = 200;
+
+        // interval partially intersects 2 pages (0 and 1)
+        let intervals = &[Interval::new(5, 6)];
+
+        let pages = select_pages(intervals, locations, num_rows).unwrap();
+
+        assert_eq!(
+            pages,
+            vec![
+                FilteredPage::Select {
+                    start: 100,
+                    length: 20,
+                    rows_offset: 5,
+                    rows_length: 5
+                },
+                FilteredPage::Select {
+                    start: 120,
+                    length: 20,
+                    rows_offset: 0,
+                    rows_length: 1
+                },
+                FilteredPage::Skip {
+                    start: 140,
+                    length: 20,
+                    num_rows: 100
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn test_other_column2() {
+        let locations = &[
+            PageLocation {
+                offset: 100,
+                compressed_page_size: 20,
+                first_row_index: 0,
+            },
+            PageLocation {
+                offset: 120,
+                compressed_page_size: 20,
+                first_row_index: 10,
+            },
+            PageLocation {
+                offset: 140,
+                compressed_page_size: 20,
+                first_row_index: 100,
+            },
+        ];
+        let num_rows = 200;
+
+        // interval partially intersects 1 page (0)
+        let intervals = &[Interval::new(0, 1)];
+
+        let pages = select_pages(intervals, locations, num_rows).unwrap();
+
+        assert_eq!(
+            pages,
+            vec![
+                FilteredPage::Select {
+                    start: 100,
+                    length: 20,
+                    rows_offset: 0,
+                    rows_length: 1
+                },
+                FilteredPage::Skip {
+                    start: 120,
+                    length: 20,
+                    num_rows: 90
+                },
+                FilteredPage::Skip {
+                    start: 140,
+                    length: 20,
+                    num_rows: 100
+                },
+            ]
+        );
+    }
 }
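Note that `is_in` returns the overlap *relative to the start of the probed page*; that offset is what `FilteredPage::Select` carries as `rows_offset`. A small sketch of the arithmetic with hypothetical page locations (`PageLocation` comes from the `parquet_format_async_temp` crate; the row numbers mirror the tests above):

    use parquet2::indexes::{select_pages, Interval};
    use parquet_format_async_temp::PageLocation;

    fn main() -> Result<(), parquet2::error::ParquetError> {
        // rows 5..11 are selected; page 0 holds rows 0..10, page 1 holds rows 10..100
        let intervals = &[Interval::new(5, 6)];
        let locations = &[
            PageLocation { offset: 100, compressed_page_size: 20, first_row_index: 0 },
            PageLocation { offset: 120, compressed_page_size: 20, first_row_index: 10 },
        ];
        let pages = select_pages(intervals, locations, 100)?;
        // page 0: overlap is rows 5..10 => rows_offset: 5, rows_length: 5
        // page 1: overlap is rows 10..11 => rows_offset: 0, rows_length: 1
        println!("{pages:?}");
        Ok(())
    }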
Leitao" Date: Sat, 19 Mar 2022 07:58:39 +0000 Subject: [PATCH 03/16] Added sync reader of indexed pages --- src/indexes/intervals.rs | 21 ++- src/indexes/read.rs | 27 ++-- src/page/mod.rs | 16 ++- src/read/mod.rs | 22 +++- src/read/page/indexed_reader.rs | 227 ++++++++++++++++++++++++++++++++ src/read/page/mod.rs | 2 + src/read/page/reader.rs | 39 +++--- src/read/page/stream.rs | 1 + src/write/compression.rs | 1 + 9 files changed, 317 insertions(+), 39 deletions(-) create mode 100644 src/read/page/indexed_reader.rs diff --git a/src/indexes/intervals.rs b/src/indexes/intervals.rs index 5a3d0d269..1506dac99 100644 --- a/src/indexes/intervals.rs +++ b/src/indexes/intervals.rs @@ -77,20 +77,29 @@ pub enum FilteredPage { Select { /// Location of the page in the file start: u64, - length: u64, + length: usize, /// Location of rows to select in the page - rows_offset: u64, - rows_length: u64, + rows_offset: usize, + rows_length: usize, }, Skip { /// Location of the page in the file start: u64, - length: u64, + length: usize, /// number of rows that are skip by skipping this page num_rows: usize, }, } +impl FilteredPage { + pub fn start(&self) -> u64 { + match self { + Self::Select { start, .. } => *start, + Self::Skip { start, .. } => *start, + } + } +} + fn is_in(probe: Interval, intervals: &[Interval]) -> Option { intervals.iter().find_map(|interval| { let interval_end = interval.start + interval.length; @@ -122,8 +131,8 @@ pub fn select_pages( FilteredPage::Select { start: location.offset.try_into()?, length: location.compressed_page_size.try_into()?, - rows_offset: overlap.start, - rows_length: overlap.length, + rows_offset: overlap.start.try_into()?, + rows_length: overlap.length.try_into()?, } } else { FilteredPage::Skip { diff --git a/src/indexes/read.rs b/src/indexes/read.rs index 6587edd9d..49daa1d95 100644 --- a/src/indexes/read.rs +++ b/src/indexes/read.rs @@ -7,7 +7,7 @@ use parquet_format_async_temp::{ use crate::error::ParquetError; use crate::metadata::ColumnChunkMetaData; -use crate::schema::types::{ParquetType, PhysicalType}; +use crate::schema::types::PhysicalType; use super::{ByteIndex, FixedLenByteIndex, Index, NativeIndex}; @@ -36,20 +36,17 @@ pub fn read_column( let mut prot = TCompactInputProtocol::new(&mut d); let index = ColumnIndex::read_from_in_protocol(&mut prot)?; - let index = match chunk.descriptor().type_() { - ParquetType::PrimitiveType { physical_type, .. } => match physical_type { - PhysicalType::Boolean => return Ok(None), - PhysicalType::Int32 => Box::new(NativeIndex::::try_from(index)?) as Box, - PhysicalType::Int64 => Box::new(NativeIndex::::try_from(index)?) as _, - PhysicalType::Int96 => Box::new(NativeIndex::<[u32; 3]>::try_from(index)?) as _, - PhysicalType::Float => Box::new(NativeIndex::::try_from(index)?), - PhysicalType::Double => Box::new(NativeIndex::::try_from(index)?), - PhysicalType::ByteArray => Box::new(ByteIndex::try_from(index)?), - PhysicalType::FixedLenByteArray(size) => { - Box::new(FixedLenByteIndex::try_from((index, *size))?) - } - }, - _ => unreachable!(), + let index = match chunk.descriptor().descriptor.primitive_type.physical_type { + PhysicalType::Boolean => return Ok(None), + PhysicalType::Int32 => Box::new(NativeIndex::::try_from(index)?) as Box, + PhysicalType::Int64 => Box::new(NativeIndex::::try_from(index)?) as _, + PhysicalType::Int96 => Box::new(NativeIndex::<[u32; 3]>::try_from(index)?) 
as _, + PhysicalType::Float => Box::new(NativeIndex::::try_from(index)?), + PhysicalType::Double => Box::new(NativeIndex::::try_from(index)?), + PhysicalType::ByteArray => Box::new(ByteIndex::try_from(index)?), + PhysicalType::FixedLenByteArray(size) => { + Box::new(FixedLenByteIndex::try_from((index, size))?) + } }; Ok(Some(index)) diff --git a/src/page/mod.rs b/src/page/mod.rs index 19e21a566..d21ae4238 100644 --- a/src/page/mod.rs +++ b/src/page/mod.rs @@ -25,7 +25,10 @@ pub struct CompressedDataPage { compression: Compression, uncompressed_page_size: usize, pub(crate) dictionary_page: Option>, - pub descriptor: Descriptor, + pub(crate) descriptor: Descriptor, + + // The offset and length in rows + rows: Option<(usize, usize)>, } impl CompressedDataPage { @@ -36,6 +39,7 @@ impl CompressedDataPage { uncompressed_page_size: usize, dictionary_page: Option>, descriptor: Descriptor, + rows: Option<(usize, usize)>, ) -> Self { Self { header, @@ -44,6 +48,7 @@ impl CompressedDataPage { uncompressed_page_size, dictionary_page, descriptor, + rows, } } @@ -59,10 +64,19 @@ impl CompressedDataPage { self.buffer.len() } + /// The compression of the data in this page. + /// Note that what is compressed in a page depends on its version: + /// in V1, the whole data (`[repetition levels][definition levels][values]`) is compressed; in V2 only the values are compressed. pub fn compression(&self) -> Compression { self.compression } + /// the rows to be selected by this page. + /// When `None`, all rows are to be considered. + pub fn rows(&self) -> Option<(usize, usize)> { + self.rows + } + pub fn num_values(&self) -> usize { self.header.num_values() } diff --git a/src/read/mod.rs b/src/read/mod.rs index d2c3cfa43..50f4c592e 100644 --- a/src/read/mod.rs +++ b/src/read/mod.rs @@ -13,11 +13,12 @@ pub use compression::{decompress, BasicDecompressor, Decompressor}; pub use metadata::read_metadata; #[cfg(feature = "stream")] pub use page::get_page_stream; -pub use page::{PageFilter, PageReader}; +pub use page::{IndexedPageReader, PageFilter, PageReader}; #[cfg(feature = "stream")] pub use stream::read_metadata as read_metadata_async; use crate::error::ParquetError; +use crate::indexes::FilteredPage; use crate::metadata::{ColumnChunkMetaData, RowGroupMetaData}; use crate::page::CompressedDataPage; use crate::schema::types::ParquetType; @@ -61,6 +62,25 @@ pub fn get_page_iterator( )) } +/// Returns a new [`IndexedPageReader`] by seeking `reader` to the begining of `column_chunk`. +pub fn get_indexed_page_reader( + column_chunk: &ColumnChunkMetaData, + reader: R, + pages: Vec, + buffer: Vec, + data_buffer: Vec, +) -> Result> { + Ok(IndexedPageReader::new( + reader, + column_chunk.compression(), + column_chunk.descriptor().descriptor.clone(), + column_chunk.byte_range().0, + pages, + buffer, + data_buffer, + )) +} + /// Returns an [`Iterator`] of [`ColumnChunkMetaData`] corresponding to the columns /// from `field` at `row_group`. /// For primitive fields (e.g. `i64`), the iterator has exactly one item. 
diff --git a/src/read/page/indexed_reader.rs b/src/read/page/indexed_reader.rs new file mode 100644 index 000000000..11997d6e3 --- /dev/null +++ b/src/read/page/indexed_reader.rs @@ -0,0 +1,227 @@ +use std::{ + collections::VecDeque, + io::{Cursor, Read, Seek, SeekFrom}, + sync::Arc, +}; + +use crate::{ + error::ParquetError, + indexes::FilteredPage, + metadata::Descriptor, + page::{CompressedDataPage, DictPage, ParquetPageHeader}, + parquet_bridge::Compression, +}; + +use super::reader::{finish_page, read_page_header, FinishedPage}; + +enum LazyDict { + // The dictionary has been read and deserialized + Dictionary(Arc), + // The range of the dictionary page + Range(u64, usize), +} + +/// A fallible [`Iterator`] of [`CompressedDataPage`]. This iterator leverages page indexes +/// to skip pages that are not needed. Consequently, the pages from this +/// iterator always have [`Some`] [`CompressedDataPage::rows()`] +pub struct IndexedPageReader { + // The source + reader: R, + + compression: Compression, + + dictionary: Option, + + // used to deserialize dictionary pages and attach the descriptor to every read page + descriptor: Descriptor, + + // buffer to read the whole page [header][data] into memory + buffer: Vec, + + // buffer to store the data [data] and re-use across pages + data_buffer: Vec, + + pages: VecDeque, +} + +fn resize_buffer(buffer: &mut Vec, length: usize) { + // prepare buffer + if length > buffer.len() { + // dealloc and ignore region, replacing it by a new region + *buffer = vec![0u8; length]; + } else { + buffer.clear(); + buffer.resize(length, 0); + } +} + +fn read_page( + reader: &mut R, + start: u64, + length: usize, + buffer: &mut Vec, + data: &mut Vec, +) -> Result { + // seek to the page + reader.seek(SeekFrom::Start(start))?; + + // read [header][data] to buffer + resize_buffer(buffer, length); + reader.read_exact(buffer)?; + + // deserialize [header] + let mut reader = Cursor::new(buffer); + let page_header = read_page_header(&mut reader)?; + let header_size = reader.seek(SeekFrom::Current(0)).unwrap() as usize; + let buffer = reader.into_inner(); + + // copy [data] + data.clear(); + data.extend_from_slice(&buffer[header_size..]); + Ok(page_header) +} + +fn read_dict_page( + reader: &mut R, + start: u64, + length: usize, + buffer: &mut Vec, + data: &mut Vec, + compression: Compression, + descriptor: &Descriptor, +) -> Result, ParquetError> { + let page_header = read_page(reader, start, length, buffer, data)?; + + let result = finish_page(page_header, data, compression, &None, descriptor, None)?; + match result { + FinishedPage::Data(_) => Err(ParquetError::OutOfSpec( + "The first page is not a dictionary page but it should".to_string(), + )), + FinishedPage::Dict(dict) => Ok(dict), + } +} + +impl IndexedPageReader { + pub fn new( + reader: R, + compression: Compression, + descriptor: Descriptor, + column_start: u64, + pages: Vec, + buffer: Vec, + data_buffer: Vec, + ) -> Self { + // a dictionary page exists iff the first data page is not at the start of + // the column + let dictionary = match pages.get(0) { + Some(page) => { + let length = (page.start() - column_start) as usize; + if length > 0 { + Some(LazyDict::Range(column_start, length)) + } else { + None + } + } + None => None, + }; + + let pages = pages.into_iter().collect(); + Self { + reader, + compression, + descriptor, + buffer, + data_buffer, + pages, + dictionary, + } + } + + /// consumes self into the reader and the two internal buffers + pub fn into_inner(self) -> (R, Vec, Vec) { + (self.reader, 
self.buffer, self.data_buffer) + } + + fn read_page( + &mut self, + start: u64, + length: usize, + rows: (usize, usize), + ) -> Result { + // it will be read - take buffer + let mut data = std::mem::take(&mut self.data_buffer); + + // read the dictionary if needed + let dict = self + .dictionary + .as_mut() + .map(|dict| match &dict { + LazyDict::Dictionary(dict) => Ok(dict.clone()), + LazyDict::Range(start, length) => { + let maybe_page = read_dict_page( + &mut self.reader, + *start, + *length, + &mut self.buffer, + &mut data, + self.compression, + &self.descriptor, + ); + + match maybe_page { + Ok(d) => { + *dict = LazyDict::Dictionary(d.clone()); + Ok(d) + } + Err(e) => Err(e), + } + } + }) + .transpose()?; + + let page_header = read_page(&mut self.reader, start, length, &mut self.buffer, &mut data)?; + + finish_page( + page_header, + &mut data, + self.compression, + &dict, + &self.descriptor, + Some(rows), + ) + } +} + +impl Iterator for IndexedPageReader { + type Item = Result; + + fn next(&mut self) -> Option { + // todo: check if the first page is a dictionary page and read it accordingly so that + // we can attach it to data pages + + if let Some(page) = self.pages.pop_front() { + match page { + FilteredPage::Select { + start, + length, + rows_offset, + rows_length, + } => { + let page = match self.read_page(start, length, (rows_offset, rows_length)) { + Err(e) => return Some(Err(e)), + Ok(header) => header, + }; + match page { + FinishedPage::Data(page) => Some(Ok(page)), + FinishedPage::Dict(_) => Some(Err(ParquetError::OutOfSpec( + "Dictionary pages cannot be selected via indexes".to_string(), + ))), + } + } + FilteredPage::Skip { .. } => self.next(), + } + } else { + None + } + } +} diff --git a/src/read/page/mod.rs b/src/read/page/mod.rs index 9dcdee346..b395d9d85 100644 --- a/src/read/page/mod.rs +++ b/src/read/page/mod.rs @@ -1,9 +1,11 @@ +mod indexed_reader; mod reader; #[cfg(feature = "stream")] mod stream; use crate::{error::ParquetError, page::CompressedDataPage}; +pub use indexed_reader::IndexedPageReader; pub use reader::{PageFilter, PageReader}; pub trait PageIterator: Iterator> { diff --git a/src/read/page/reader.rs b/src/read/page/reader.rs index 0814f6932..56b5b8139 100644 --- a/src/read/page/reader.rs +++ b/src/read/page/reader.rs @@ -17,8 +17,11 @@ use super::PageIterator; /// Type declaration for a page filter pub type PageFilter = Arc bool + Send + Sync>; -/// A page iterator iterates over row group's pages. In parquet, pages are guaranteed to be -/// contiguously arranged in memory and therefore must be read in sequence. +/// A fallible [`Iterator`] of [`CompressedDataPage`]. This iterator reads pages back +/// to back until all pages have been consumed. +/// The pages from this iterator always have [`None`] [`CompressedDataPage::rows()`] since +/// filter pushdown is not supported without a +/// pre-computed [page index](https://github.com/apache/parquet-format/blob/master/PageIndex.md). pub struct PageReader { // The source reader: R, @@ -63,13 +66,6 @@ impl PageReader { } } - /// Reads Page header from Thrift. - fn read_page_header(&mut self) -> Result { - let mut prot = TCompactInputProtocol::new(&mut self.reader); - let page_header = ParquetPageHeader::read_from_in_protocol(&mut prot)?; - Ok(page_header) - } - pub fn into_inner(self) -> (R, Vec) { (self.reader, self.buffer) } @@ -104,6 +100,13 @@ impl Iterator for PageReader { } } +/// Reads Page header from Thrift. 
+pub(super) fn read_page_header(reader: &mut R) -> Result { + let mut prot = TCompactInputProtocol::new(reader); + let page_header = ParquetPageHeader::read_from_in_protocol(&mut prot)?; + Ok(page_header) +} + /// This function is lightweight and executes a minimal amount of work so that it is IO bounded. // Any un-necessary CPU-intensive tasks SHOULD be executed on individual pages. fn next_page( @@ -126,11 +129,11 @@ fn next_page( Ok(None) } -fn build_page( +pub(super) fn build_page( reader: &mut PageReader, buffer: &mut Vec, ) -> Result> { - let page_header = reader.read_page_header()?; + let page_header = read_page_header(&mut reader.reader)?; reader.seen_num_values += get_page_header(&page_header) .map(|x| x.num_values() as i64) .unwrap_or_default(); @@ -152,6 +155,7 @@ fn build_page( reader.compression, &reader.current_dictionary, &reader.descriptor, + None, )?; match result { @@ -170,10 +174,11 @@ pub(super) enum FinishedPage { pub(super) fn finish_page( page_header: ParquetPageHeader, - buffer: &mut Vec, + data: &mut Vec, compression: Compression, current_dictionary: &Option>, descriptor: &Descriptor, + rows: Option<(usize, usize)>, ) -> Result { let type_ = page_header.type_.try_into()?; match type_ { @@ -183,7 +188,7 @@ pub(super) fn finish_page( // move the buffer to `dict_page` let mut dict_page = - EncodedDictPage::new(std::mem::take(buffer), dict_header.num_values as usize); + EncodedDictPage::new(std::mem::take(data), dict_header.num_values as usize); let page = read_dict_page( &dict_page, @@ -192,7 +197,7 @@ pub(super) fn finish_page( descriptor.primitive_type.physical_type, )?; // take the buffer out of the `dict_page` to re-use it - std::mem::swap(&mut dict_page.buffer, buffer); + std::mem::swap(&mut dict_page.buffer, data); Ok(FinishedPage::Dict(page)) } @@ -201,11 +206,12 @@ pub(super) fn finish_page( Ok(FinishedPage::Data(CompressedDataPage::new( DataPageHeader::V1(header), - std::mem::take(buffer), + std::mem::take(data), compression, page_header.uncompressed_page_size as usize, current_dictionary.clone(), descriptor.clone(), + rows, ))) } PageType::DataPageV2 => { @@ -213,11 +219,12 @@ pub(super) fn finish_page( Ok(FinishedPage::Data(CompressedDataPage::new( DataPageHeader::V2(header), - std::mem::take(buffer), + std::mem::take(data), compression, page_header.uncompressed_page_size as usize, current_dictionary.clone(), descriptor.clone(), + rows, ))) } } diff --git a/src/read/page/stream.rs b/src/read/page/stream.rs index dcd7549ea..73fc230cb 100644 --- a/src/read/page/stream.rs +++ b/src/read/page/stream.rs @@ -71,6 +71,7 @@ fn _get_page_stream<'a, R: AsyncRead + AsyncSeek + Unpin + Send>( compression, ¤t_dictionary, descriptor, + None, )?; match result { diff --git a/src/write/compression.rs b/src/write/compression.rs index 2f13ced8a..c9d160ec0 100644 --- a/src/write/compression.rs +++ b/src/write/compression.rs @@ -47,6 +47,7 @@ fn compress_data( uncompressed_page_size, dictionary_page, descriptor, + None, // the writing of a page has no selected rows, so this value is not used )) } From cd8a8507817262c71ea1c893a99f1b1db66bbcb6 Mon Sep 17 00:00:00 2001 From: "Jorge C. 
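Patches 01–03 together form a synchronous filtered-read path: column index → row intervals → page selection → indexed page reader. A sketch of how the pieces are meant to compose, using the signatures as of this patch and assuming a file whose first column is INT32 and carries a page index (the locals and the predicate are illustrative):

    use parquet2::error::Result;
    use parquet2::indexes::{self, compute_rows, select_pages, NativeIndex, PageIndex};
    use parquet2::read::{get_indexed_page_reader, read_metadata};

    fn read_filtered(mut file: std::fs::File) -> Result<()> {
        let metadata = read_metadata(&mut file)?;
        let column = &metadata.row_groups[0].columns()[0];
        let num_rows = metadata.row_groups[0].num_rows() as u64;

        // 1. read the column index and the page locations
        let boxed = indexes::read_column(&mut file, column)?.expect("a column index");
        let index = boxed.as_any().downcast_ref::<NativeIndex<i32>>().unwrap();
        let locations = indexes::read_page_locations(&mut file, column.column_chunk())?
            .expect("an offset index");

        // 2. turn a predicate over the per-page min/max into row intervals
        let selector = |page: &PageIndex<i32>| page.max.map(|max| max > 100).unwrap_or(false);
        let rows = compute_rows(&index.indexes, &locations, num_rows, &selector)?;

        // 3. map the row intervals back to pages and read only those pages
        let pages = select_pages(&rows, &locations, num_rows)?;
        for page in get_indexed_page_reader(column, file, pages, vec![], vec![])? {
            // selected pages always carry their row range
            assert!(page?.rows().is_some());
        }
        Ok(())
    }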
Leitao" Date: Sat, 19 Mar 2022 12:05:23 +0000 Subject: [PATCH 04/16] Added rows to DataPage --- src/page/mod.rs | 11 ++++++++++- src/read/compression.rs | 2 ++ src/write/compression.rs | 3 ++- tests/it/write/primitive.rs | 1 + 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/page/mod.rs b/src/page/mod.rs index d21ae4238..4a9387dd0 100644 --- a/src/page/mod.rs +++ b/src/page/mod.rs @@ -28,7 +28,7 @@ pub struct CompressedDataPage { pub(crate) descriptor: Descriptor, // The offset and length in rows - rows: Option<(usize, usize)>, + pub(crate) rows: Option<(usize, usize)>, } impl CompressedDataPage { @@ -119,6 +119,7 @@ pub struct DataPage { pub(super) buffer: Vec, pub(super) dictionary_page: Option>, pub descriptor: Descriptor, + pub rows: Option<(usize, usize)>, } impl DataPage { @@ -127,12 +128,14 @@ impl DataPage { buffer: Vec, dictionary_page: Option>, descriptor: Descriptor, + rows: Option<(usize, usize)>, ) -> Self { Self { header, buffer, dictionary_page, descriptor, + rows, } } @@ -148,6 +151,12 @@ impl DataPage { &self.buffer } + /// the rows to be selected by this page. + /// When `None`, all rows are to be considered. + pub fn rows(&self) -> Option<(usize, usize)> { + self.rows + } + /// Returns a mutable reference to the internal buffer. /// Useful to recover the buffer after the page has been decoded. pub fn buffer_mut(&mut self) -> &mut Vec { diff --git a/src/read/compression.rs b/src/read/compression.rs index 3a2c1f918..2c9061bac 100644 --- a/src/read/compression.rs +++ b/src/read/compression.rs @@ -90,6 +90,7 @@ pub fn decompress( std::mem::take(buffer), compressed_page.dictionary_page, compressed_page.descriptor, + compressed_page.rows, )) } @@ -105,6 +106,7 @@ fn decompress_reuse( std::mem::take(buffer), compressed_page.dictionary_page, compressed_page.descriptor, + compressed_page.rows, ); if was_decompressed { diff --git a/src/write/compression.rs b/src/write/compression.rs index c9d160ec0..abb78c60a 100644 --- a/src/write/compression.rs +++ b/src/write/compression.rs @@ -18,6 +18,7 @@ fn compress_data( header, dictionary_page, descriptor, + rows, } = page; let uncompressed_page_size = buffer.len(); if compression != Compression::Uncompressed { @@ -47,7 +48,7 @@ fn compress_data( uncompressed_page_size, dictionary_page, descriptor, - None, // the writing of a page has no selected rows, so this value is not used + rows, )) } diff --git a/tests/it/write/primitive.rs b/tests/it/write/primitive.rs index bd1c8507e..e22518a4d 100644 --- a/tests/it/write/primitive.rs +++ b/tests/it/write/primitive.rs @@ -75,5 +75,6 @@ pub fn array_to_page_v1( buffer, None, descriptor.clone(), + None, ))) } From 6cb259ba0608832d6340a845a4bb81938b5ce2d6 Mon Sep 17 00:00:00 2001 From: "Jorge C. 
Leitao" Date: Sat, 19 Mar 2022 12:05:23 +0000 Subject: [PATCH 05/16] Added write of indexes --- examples/read_metadata.rs | 2 +- src/bloom_filter/read.rs | 5 +- src/indexes/index.rs | 59 ++++++++++++++-- src/indexes/mod.rs | 7 +- src/indexes/read.rs | 25 +++---- src/indexes/write.rs | 97 +++++++++++++++++++++++++++ src/metadata/column_chunk_metadata.rs | 31 ++++----- src/page/mod.rs | 14 ++++ src/write/column_chunk.rs | 4 +- src/write/file.rs | 47 ++++++++++++- src/write/mod.rs | 4 +- src/write/page.rs | 12 ++++ src/write/row_group.rs | 16 +++-- tests/it/write/mod.rs | 77 +++++++++++++++++++++ tests/it/write/primitive.rs | 2 +- 15 files changed, 348 insertions(+), 54 deletions(-) create mode 100644 src/indexes/write.rs diff --git a/examples/read_metadata.rs b/examples/read_metadata.rs index ed5246d4c..b7be56b82 100644 --- a/examples/read_metadata.rs +++ b/examples/read_metadata.rs @@ -65,7 +65,7 @@ fn main() -> Result<()> { } // read the offset index containing page locations - let maybe_pages = indexes::read_page_locations(&mut reader, column_metadata.column_chunk())?; + let maybe_pages = indexes::read_page_locations(&mut reader, column_metadata)?; if let Some(pages) = maybe_pages { // there are page locations in the file println!("{pages:?}"); diff --git a/src/bloom_filter/read.rs b/src/bloom_filter/read.rs index e06a1eb97..0bcd582ee 100644 --- a/src/bloom_filter/read.rs +++ b/src/bloom_filter/read.rs @@ -16,10 +16,7 @@ pub fn read( mut reader: &mut R, bitset: &mut Vec, ) -> Result<(), ParquetError> { - let offset = column_metadata - .metadata() - .ok_or_else(|| ParquetError::OutOfSpec("Column metadata is required".to_string()))? - .bloom_filter_offset; + let offset = column_metadata.metadata().bloom_filter_offset; let offset = if let Some(offset) = offset { offset as u64 diff --git a/src/indexes/index.rs b/src/indexes/index.rs index 38fb89d77..1f6c1cff0 100644 --- a/src/indexes/index.rs +++ b/src/indexes/index.rs @@ -11,15 +11,66 @@ pub trait Index: Send + Sync + std::fmt::Debug { fn physical_type(&self) -> &PhysicalType; } +impl PartialEq for dyn Index + '_ { + fn eq(&self, that: &dyn Index) -> bool { + equal(self, that) + } +} + +impl Eq for dyn Index + '_ {} + +fn equal(lhs: &dyn Index, rhs: &dyn Index) -> bool { + if lhs.physical_type() != rhs.physical_type() { + return false; + } + + match lhs.physical_type() { + PhysicalType::Boolean => unreachable!(), + PhysicalType::Int32 => { + lhs.as_any().downcast_ref::>().unwrap() + == rhs.as_any().downcast_ref::>().unwrap() + } + PhysicalType::Int64 => { + lhs.as_any().downcast_ref::>().unwrap() + == rhs.as_any().downcast_ref::>().unwrap() + } + PhysicalType::Int96 => { + lhs.as_any() + .downcast_ref::>() + .unwrap() + == rhs + .as_any() + .downcast_ref::>() + .unwrap() + } + PhysicalType::Float => { + lhs.as_any().downcast_ref::>().unwrap() + == rhs.as_any().downcast_ref::>().unwrap() + } + PhysicalType::Double => { + lhs.as_any().downcast_ref::>().unwrap() + == rhs.as_any().downcast_ref::>().unwrap() + } + PhysicalType::ByteArray => { + lhs.as_any().downcast_ref::().unwrap() + == rhs.as_any().downcast_ref::().unwrap() + } + PhysicalType::FixedLenByteArray(_) => { + lhs.as_any().downcast_ref::().unwrap() + == rhs.as_any().downcast_ref::().unwrap() + } + } +} + /// An index of a column of [`NativeType`] physical representation -#[derive(Debug, Clone, PartialEq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct NativeIndex { pub indexes: Vec>, pub boundary_order: BoundaryOrder, } /// The index of a page, containing the min and 
max values of the page. -#[derive(Debug, Clone, PartialEq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct PageIndex { /// The minimum value in the page. It is None when all values are null pub min: Option, @@ -80,7 +131,7 @@ impl Index for NativeIndex { } /// An index of a column of bytes physical type -#[derive(Debug, Clone, PartialEq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct ByteIndex { pub indexes: Vec>>, pub boundary_order: BoundaryOrder, @@ -135,7 +186,7 @@ impl Index for ByteIndex { } /// An index of a column of fixed len byte physical type -#[derive(Debug, Clone, PartialEq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct FixedLenByteIndex { pub type_: PhysicalType, pub indexes: Vec>>, diff --git a/src/indexes/mod.rs b/src/indexes/mod.rs index 585e630d2..6ac51ce97 100644 --- a/src/indexes/mod.rs +++ b/src/indexes/mod.rs @@ -1,15 +1,18 @@ mod index; mod intervals; mod read; +mod write; + +pub use crate::parquet_bridge::BoundaryOrder; +pub use parquet_format_async_temp::PageLocation; pub use self::index::{ByteIndex, FixedLenByteIndex, Index, NativeIndex, PageIndex}; pub use intervals::{compute_rows, select_pages, FilteredPage, Interval}; pub use read::*; +pub(crate) use write::*; #[cfg(test)] mod tests { - use parquet_format_async_temp::PageLocation; - use super::*; #[test] diff --git a/src/indexes/read.rs b/src/indexes/read.rs index 49daa1d95..544544e77 100644 --- a/src/indexes/read.rs +++ b/src/indexes/read.rs @@ -2,7 +2,7 @@ use std::convert::TryInto; use std::io::{Cursor, Read, Seek, SeekFrom}; use parquet_format_async_temp::{ - thrift::protocol::TCompactInputProtocol, ColumnChunk, ColumnIndex, OffsetIndex, PageLocation, + thrift::protocol::TCompactInputProtocol, ColumnIndex, OffsetIndex, PageLocation, }; use crate::error::ParquetError; @@ -55,18 +55,19 @@ pub fn read_column( /// Read [`PageLocation`]s from the [`ColumnChunk`], if available. pub fn read_page_locations( reader: &mut R, - chunk: &ColumnChunk, + chunk: &ColumnChunkMetaData, ) -> Result>, ParquetError> { - let (offset, length): (u64, usize) = if let Some(offset) = chunk.offset_index_offset { - let length = chunk.offset_index_length.ok_or_else(|| { - ParquetError::OutOfSpec( - "The column length must exist if column offset exists".to_string(), - ) - })?; - (offset.try_into()?, length.try_into()?) - } else { - return Ok(None); - }; + let (offset, length): (u64, usize) = + if let Some(offset) = chunk.column_chunk().offset_index_offset { + let length = chunk.column_chunk().offset_index_length.ok_or_else(|| { + ParquetError::OutOfSpec( + "The column length must exist if column offset exists".to_string(), + ) + })?; + (offset.try_into()?, length.try_into()?) 
+ } else { + return Ok(None); + }; reader.seek(SeekFrom::Start(offset))?; let mut data = vec![0; length]; diff --git a/src/indexes/write.rs b/src/indexes/write.rs new file mode 100644 index 000000000..60307280e --- /dev/null +++ b/src/indexes/write.rs @@ -0,0 +1,97 @@ +use std::io::Write; + +use parquet_format_async_temp::BoundaryOrder; +use parquet_format_async_temp::ColumnIndex; + +use parquet_format_async_temp::thrift::protocol::TCompactOutputProtocol; +use parquet_format_async_temp::OffsetIndex; +use parquet_format_async_temp::PageLocation; +use parquet_format_async_temp::PageType; + +use crate::error::{ParquetError, Result}; +pub use crate::metadata::KeyValue; +use crate::statistics::serialize_statistics; + +use crate::write::page::PageWriteSpec; + +fn is_data_page(page: &PageWriteSpec) -> bool { + page.header.type_ == PageType::DATA_PAGE || page.header.type_ == PageType::DATA_PAGE_V2 +} + +pub fn write_column_index(writer: &mut W, pages: &[PageWriteSpec]) -> Result { + let mut null_pages = Vec::with_capacity(pages.len()); + let mut min_values = Vec::with_capacity(pages.len()); + let mut max_values = Vec::with_capacity(pages.len()); + let mut null_counts = Vec::with_capacity(pages.len()); + + pages + .iter() + .filter(|x| is_data_page(x)) + .try_for_each(|spec| { + if let Some(stats) = &spec.statistics { + let stats = serialize_statistics(stats.as_ref()); + + let null_count = stats.null_count.ok_or_else(|| { + ParquetError::OutOfSpec("null count of a page is required".to_string()) + })?; + + null_counts.push(null_count); + if null_count as usize == spec.num_values { + min_values.push(vec![0]); + max_values.push(vec![0]); + null_pages.push(true) + } else { + min_values.push(stats.min_value.ok_or_else(|| { + ParquetError::OutOfSpec("min value of a page is required".to_string()) + })?); + max_values.push(stats.max_value.ok_or_else(|| { + ParquetError::OutOfSpec("max value of a page is required".to_string()) + })?); + null_pages.push(false) + }; + + Result::Ok(()) + } else { + Err(ParquetError::OutOfSpec( + "options were set to write statistics but some pages miss them".to_string(), + )) + } + })?; + let index = ColumnIndex { + null_pages, + min_values, + max_values, + boundary_order: BoundaryOrder::UNORDERED, + null_counts: Some(null_counts), + }; + let mut protocol = TCompactOutputProtocol::new(writer); + Ok(index.write_to_out_protocol(&mut protocol)? as u64) +} + +pub fn write_offset_index(writer: &mut W, pages: &[PageWriteSpec]) -> Result { + let mut first_row_index = 0; + let page_locations = pages + .iter() + .filter(|x| is_data_page(x)) + .map(|spec| { + let location = PageLocation { + offset: spec.offset.try_into()?, + compressed_page_size: spec.bytes_written.try_into()?, + first_row_index, + }; + let num_rows = spec.num_rows.ok_or_else(|| { + ParquetError::OutOfSpec( + "options were set to write statistics but some data pages miss number of rows" + .to_string(), + ) + })?; + first_row_index += num_rows as i64; + Ok(location) + }) + .collect::>>()?; + + let offset_index = OffsetIndex { page_locations }; + + let mut protocol = TCompactOutputProtocol::new(&mut *writer); + Ok(offset_index.write_to_out_protocol(&mut protocol)? 
as u64) +} diff --git a/src/metadata/column_chunk_metadata.rs b/src/metadata/column_chunk_metadata.rs index 456d8f5df..c02b61696 100644 --- a/src/metadata/column_chunk_metadata.rs +++ b/src/metadata/column_chunk_metadata.rs @@ -46,19 +46,14 @@ impl ColumnChunkMetaData { &self.column_chunk } - // The column's metadata - fn column_metadata(&self) -> &ColumnMetaData { + // The column chunk's metadata + pub fn metadata(&self) -> &ColumnMetaData { self.column_chunk.meta_data.as_ref().unwrap() } - // The column's metadata - pub fn metadata(&self) -> Option<&ColumnMetaData> { - self.column_chunk.meta_data.as_ref() - } - /// Type of this column. Must be primitive. pub fn type_(&self) -> &Type { - &self.column_metadata().type_ + &self.metadata().type_ } /// The [`ColumnDescriptor`] for this column. This descriptor contains the physical and logical type @@ -75,7 +70,7 @@ impl ColumnChunkMetaData { /// Decodes the raw statistics into a statistics pub fn statistics(&self) -> Option>> { - self.column_metadata() + self.metadata() .statistics .as_ref() .map(|x| deserialize_statistics(x, self.column_descr.descriptor.primitive_type.clone())) @@ -83,47 +78,47 @@ impl ColumnChunkMetaData { /// Total number of values in this column chunk. pub fn num_values(&self) -> i64 { - self.column_metadata().num_values + self.metadata().num_values } /// [`Compression`] for this column. pub fn compression(&self) -> Compression { - self.column_metadata().codec.try_into().unwrap() + self.metadata().codec.try_into().unwrap() } /// Returns the total compressed data size of this column chunk. pub fn compressed_size(&self) -> i64 { - self.column_metadata().total_compressed_size + self.metadata().total_compressed_size } /// Returns the total uncompressed data size of this column chunk. pub fn uncompressed_size(&self) -> i64 { - self.column_metadata().total_uncompressed_size + self.metadata().total_uncompressed_size } /// Returns the offset for the column data. pub fn data_page_offset(&self) -> i64 { - self.column_metadata().data_page_offset + self.metadata().data_page_offset } /// Returns `true` if this column chunk contains a index page, `false` otherwise. pub fn has_index_page(&self) -> bool { - self.column_metadata().index_page_offset.is_some() + self.metadata().index_page_offset.is_some() } /// Returns the offset for the index page. pub fn index_page_offset(&self) -> Option { - self.column_metadata().index_page_offset + self.metadata().index_page_offset } /// Returns the offset for the dictionary page, if any. pub fn dictionary_page_offset(&self) -> Option { - self.column_metadata().dictionary_page_offset + self.metadata().dictionary_page_offset } /// Returns the encoding for this column pub fn column_encoding(&self) -> &Vec { - &self.column_metadata().encodings + &self.metadata().encodings } /// Returns the offset and length in bytes of the column chunk within the file diff --git a/src/page/mod.rs b/src/page/mod.rs index 4a9387dd0..aa0f09e2c 100644 --- a/src/page/mod.rs +++ b/src/page/mod.rs @@ -234,6 +234,20 @@ impl CompressedPage { CompressedPage::Dict(page) => &mut page.buffer, } } + + pub(crate) fn num_values(&self) -> usize { + match self { + CompressedPage::Data(page) => page.num_values(), + CompressedPage::Dict(_) => 0, + } + } + + pub(crate) fn rows(&self) -> Option<(usize, usize)> { + match self { + CompressedPage::Data(page) => page.rows, + CompressedPage::Dict(_) => None, + } + } } /// Splits the page buffer into 3 slices corresponding to (encoded rep levels, encoded def levels, encoded values) for v1 pages. 
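For reference, `write_column_index` above encodes an all-null page with a one-byte placeholder min/max and flags it only through `null_pages`. An illustrative (hypothetical) `ColumnIndex` for a two-page INT32 chunk whose first page spans 0..=6 with one null and whose second page is entirely null — seven nulls:

    use parquet_format_async_temp::{BoundaryOrder, ColumnIndex};

    fn main() {
        let _index = ColumnIndex {
            null_pages: vec![false, true],
            // little-endian INT32 min/max of the first page; placeholders for the null page
            min_values: vec![vec![0, 0, 0, 0], vec![0]],
            max_values: vec![vec![6, 0, 0, 0], vec![0]],
            boundary_order: BoundaryOrder::UNORDERED,
            null_counts: Some(vec![1, 7]),
        };
    }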
diff --git a/src/write/column_chunk.rs b/src/write/column_chunk.rs index 47efc39e6..71aa5628d 100644 --- a/src/write/column_chunk.rs +++ b/src/write/column_chunk.rs @@ -29,7 +29,7 @@ pub fn write_column_chunk<'a, W, E>( descriptor: &ColumnDescriptor, compression: Compression, mut compressed_pages: DynStreamingIterator<'a, CompressedPage, E>, -) -> Result<(ColumnChunk, u64)> +) -> Result<(ColumnChunk, Vec, u64)> where W: Write, ParquetError: From, @@ -58,7 +58,7 @@ where .write_to_out_protocol(&mut protocol)? as u64; protocol.flush()?; - Ok((column_chunk, bytes_written)) + Ok((column_chunk, specs, bytes_written)) } pub async fn write_column_chunk_async( diff --git a/src/write/file.rs b/src/write/file.rs index 5dd3b673b..6f20acfd0 100644 --- a/src/write/file.rs +++ b/src/write/file.rs @@ -6,15 +6,18 @@ use parquet_format_async_temp::thrift::protocol::TCompactOutputProtocol; use parquet_format_async_temp::thrift::protocol::TOutputProtocol; use parquet_format_async_temp::RowGroup; -pub use crate::metadata::KeyValue; +use crate::indexes::{write_column_index, write_offset_index}; use crate::{ error::{ParquetError, Result}, metadata::SchemaDescriptor, FOOTER_SIZE, PARQUET_MAGIC, }; +use super::page::PageWriteSpec; use super::{row_group::write_row_group, RowGroupIter, WriteOptions}; +pub use crate::metadata::KeyValue; + pub(super) fn start_file(writer: &mut W) -> Result { writer.write_all(&PARQUET_MAGIC)?; Ok(PARQUET_MAGIC.len() as u64) @@ -49,6 +52,7 @@ pub struct FileWriter { offset: u64, row_groups: Vec, + page_specs: Vec>>, } // Accessors @@ -79,6 +83,7 @@ impl FileWriter { created_by, offset: 0, row_groups: vec![], + page_specs: vec![], } } @@ -102,7 +107,7 @@ impl FileWriter { )); } let ordinal = self.row_groups.len(); - let (group, size) = write_row_group( + let (group, specs, size) = write_row_group( &mut self.writer, self.offset, self.schema.columns(), @@ -113,6 +118,7 @@ impl FileWriter { )?; self.offset += size; self.row_groups.push(group); + self.page_specs.push(specs); Ok(()) } @@ -122,6 +128,43 @@ impl FileWriter { // compute file stats let num_rows = self.row_groups.iter().map(|group| group.num_rows).sum(); + if self.options.write_statistics { + // write column indexes + self.row_groups + .iter_mut() + .zip(self.page_specs.iter()) + .try_for_each(|(group, pages)| { + group.columns.iter_mut().zip(pages.iter()).try_for_each( + |(column, pages)| { + let offset = self.offset; + column.column_index_offset = Some(offset as i64); + self.offset += write_column_index(&mut self.writer, pages)?; + let length = self.offset - offset; + column.column_index_length = Some(length as i32); + Result::Ok(()) + }, + )?; + Result::Ok(()) + })?; + + // write offset index + self.row_groups + .iter_mut() + .zip(self.page_specs.iter()) + .try_for_each(|(group, pages)| { + group.columns.iter_mut().zip(pages.iter()).try_for_each( + |(column, pages)| { + let offset = self.offset; + column.offset_index_offset = Some(offset as i64); + self.offset += write_offset_index(&mut self.writer, pages)?; + column.offset_index_length = Some((self.offset - offset) as i32); + Result::Ok(()) + }, + )?; + Result::Ok(()) + })?; + }; + let metadata = FileMetaData::new( self.options.version.into(), self.schema.into_thrift()?, diff --git a/src/write/mod.rs b/src/write/mod.rs index 1e324180d..2b9b27a72 100644 --- a/src/write/mod.rs +++ b/src/write/mod.rs @@ -1,7 +1,7 @@ mod column_chunk; mod compression; mod file; -mod page; +pub(crate) mod page; mod row_group; pub(self) mod statistics; @@ -28,7 +28,7 @@ pub type RowGroupIter<'a, E> 
 /// Write options of different interfaces on this crate
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
 pub struct WriteOptions {
-    /// Whether to write statistics
+    /// Whether to write statistics, including indexes
     pub write_statistics: bool,
     /// Whether to use compression
     pub compression: Compression,
diff --git a/src/write/page.rs b/src/write/page.rs
index bef6dc8ab..7c2c97662 100644
--- a/src/write/page.rs
+++ b/src/write/page.rs
@@ -35,6 +35,8 @@ fn maybe_bytes(uncompressed: usize, compressed: usize) -> Result<(i32, i32)> {
 /// Contains page write metrics.
 pub struct PageWriteSpec {
     pub header: ParquetPageHeader,
+    pub num_values: usize,
+    pub num_rows: Option<usize>,
     pub header_size: u64,
     pub offset: u64,
     pub bytes_written: u64,
@@ -46,6 +48,9 @@ pub fn write_page<W: Write>(
     offset: u64,
     compressed_page: &CompressedPage,
 ) -> Result<PageWriteSpec> {
+    let num_values = compressed_page.num_values();
+    let rows = compressed_page.rows();
+
     let header = match &compressed_page {
         CompressedPage::Data(compressed_page) => assemble_data_page_header(compressed_page),
         CompressedPage::Dict(compressed_page) => assemble_dict_page_header(compressed_page),
@@ -76,6 +81,8 @@ pub fn write_page<W: Write>(
         offset,
         bytes_written,
         statistics,
+        num_rows: rows.map(|x| x.1),
+        num_values,
     })
 }

@@ -84,6 +91,9 @@ pub async fn write_page_async<W: AsyncWrite + Unpin + Send>(
     offset: u64,
     compressed_page: &CompressedPage,
 ) -> Result<PageWriteSpec> {
+    let num_values = compressed_page.num_values();
+    let rows = compressed_page.rows();
+
     let header = match &compressed_page {
         CompressedPage::Data(compressed_page) => assemble_data_page_header(compressed_page),
         CompressedPage::Dict(compressed_page) => assemble_dict_page_header(compressed_page),
@@ -114,6 +124,8 @@ pub async fn write_page_async<W: AsyncWrite + Unpin + Send>(
         offset,
         bytes_written,
         statistics,
+        num_rows: rows.map(|x| x.1),
+        num_values,
     })
 }
diff --git a/src/write/row_group.rs b/src/write/row_group.rs
index d805b5301..1b61f8d84 100644
--- a/src/write/row_group.rs
+++ b/src/write/row_group.rs
@@ -12,6 +12,7 @@ use crate::{

 use super::{
     column_chunk::{write_column_chunk, write_column_chunk_async},
+    page::PageWriteSpec,
     DynIter, DynStreamingIterator,
 };

@@ -63,7 +64,7 @@ pub fn write_row_group<
     columns: DynIter<'a, std::result::Result<DynStreamingIterator<'a, CompressedPage, E>, E>>,
     num_rows: usize,
     ordinal: usize,
-) -> Result<(RowGroup, u64)>
+) -> Result<(RowGroup, Vec<Vec<PageWriteSpec>>, u64)>
 where
     W: Write,
     ParquetError: From<E>,
@@ -74,10 +75,10 @@ where
     let initial = offset;
     let columns = column_iter
         .map(|(descriptor, page_iter)| {
-            let (column, size) =
+            let (column, page_specs, size) =
                 write_column_chunk(writer, offset, descriptor, compression, page_iter?)?;
             offset += size;
-            Ok(column)
+            Ok((column, page_specs))
         })
         .collect::<Result<Vec<_>>>()?;
     let bytes_written = offset - initial;

@@ -85,20 +86,22 @@ where
     // compute row group stats
     let file_offset = columns
         .get(0)
-        .map(|column_chunk| {
+        .map(|(column_chunk, _)| {
             ColumnOffsetsMetadata::from_column_chunk(column_chunk).calc_row_group_file_offset()
         })
         .unwrap_or(None);

     let total_byte_size = columns
         .iter()
-        .map(|c| c.meta_data.as_ref().unwrap().total_uncompressed_size)
+        .map(|(c, _)| c.meta_data.as_ref().unwrap().total_uncompressed_size)
         .sum();
     let total_compressed_size = columns
         .iter()
-        .map(|c| c.meta_data.as_ref().unwrap().total_compressed_size)
+        .map(|(c, _)| c.meta_data.as_ref().unwrap().total_compressed_size)
         .sum();

+    let (columns, specs) = columns.into_iter().unzip();
+
     Ok((
         RowGroup {
             columns,
@@ -109,6 +112,7 @@ where
             total_compressed_size: Some(total_compressed_size),
             ordinal: ordinal.try_into().ok(),
         },
+        specs,
         bytes_written,
     ))
 }
diff --git
a/tests/it/write/mod.rs b/tests/it/write/mod.rs index 28ce5f558..e58f6a94c 100644 --- a/tests/it/write/mod.rs +++ b/tests/it/write/mod.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use parquet2::compression::Compression; use parquet2::error::Result; +use parquet2::indexes::{self, BoundaryOrder, Index, NativeIndex, PageIndex, PageLocation}; use parquet2::metadata::SchemaDescriptor; use parquet2::read::read_metadata; use parquet2::statistics::Statistics; @@ -183,3 +184,79 @@ fn basic() -> Result<()> { Ok(()) } + +#[test] +fn indexes() -> Result<()> { + let array1 = vec![Some(0), Some(1), None, Some(3), Some(4), Some(5), Some(6)]; + let array2 = vec![Some(10), Some(11)]; + + let options = WriteOptions { + write_statistics: true, + compression: Compression::Uncompressed, + version: Version::V1, + }; + + let schema = SchemaDescriptor::try_from_message("message schema { OPTIONAL INT32 col; }")?; + + let pages = vec![ + array_to_page_v1::(&array1, &options, &schema.columns()[0].descriptor), + array_to_page_v1::(&array2, &options, &schema.columns()[0].descriptor), + ]; + + let expected_page_locations = vec![ + PageLocation { + offset: 4, + compressed_page_size: 63, + first_row_index: 0, + }, + PageLocation { + offset: 67, + compressed_page_size: 47, + first_row_index: array1.len() as i64, + }, + ]; + let expected_index = Box::new(NativeIndex:: { + indexes: vec![ + PageIndex { + min: Some(0), + max: Some(6), + null_count: Some(1), + }, + PageIndex { + min: Some(10), + max: Some(11), + null_count: Some(0), + }, + ], + boundary_order: BoundaryOrder::Unordered, + }) as Box; + + let pages = DynStreamingIterator::new(Compressor::new( + DynIter::new(pages.into_iter()), + options.compression, + vec![], + )); + let columns = std::iter::once(Ok(pages)); + + let writer = Cursor::new(vec![]); + let mut writer = FileWriter::new(writer, schema, options, None); + + writer.start()?; + writer.write(DynIter::new(columns), 7 + 2)?; + let writer = writer.end(None)?.1; + + let data = writer.into_inner(); + let mut reader = Cursor::new(data); + + let metadata = read_metadata(&mut reader)?; + + let column_metadata = &metadata.row_groups[0].columns()[0]; + + let index = indexes::read_column(&mut reader, column_metadata)?.expect("column index"); + assert_eq!(&index, &expected_index); + + let pages = indexes::read_page_locations(&mut reader, column_metadata)?.expect("offset index"); + assert_eq!(pages, expected_page_locations); + + Ok(()) +} diff --git a/tests/it/write/primitive.rs b/tests/it/write/primitive.rs index e22518a4d..ecad35756 100644 --- a/tests/it/write/primitive.rs +++ b/tests/it/write/primitive.rs @@ -75,6 +75,6 @@ pub fn array_to_page_v1( buffer, None, descriptor.clone(), - None, + Some((0, array.len())), ))) } From f0519a6c2a959b058a5979d6f4ec8b8b332a6a93 Mon Sep 17 00:00:00 2001 From: "Jorge C. 
Leitao" Date: Sun, 20 Mar 2022 07:14:01 +0000 Subject: [PATCH 06/16] Improved API --- examples/read_metadata.rs | 6 +++--- src/indexes/mod.rs | 6 ++---- src/indexes/read.rs | 2 +- src/read/mod.rs | 2 ++ src/write/file.rs | 2 +- tests/it/write/mod.rs | 8 ++++---- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/read_metadata.rs b/examples/read_metadata.rs index b7be56b82..025cfd229 100644 --- a/examples/read_metadata.rs +++ b/examples/read_metadata.rs @@ -1,6 +1,5 @@ use parquet2::bloom_filter; use parquet2::error::Result; -use parquet2::indexes; // ANCHOR: deserialize use parquet2::encoding::Encoding; @@ -57,7 +56,8 @@ fn main() -> Result<()> { // ANCHOR: column_index // read the column index - let index = indexes::read_column(&mut reader, column_metadata)?; + use parquet2::read; + let index = read::read_column_index(&mut reader, column_metadata)?; if let Some(index) = index { // these are the minimum and maximum within each page, which can be used // to skip pages. @@ -65,7 +65,7 @@ fn main() -> Result<()> { } // read the offset index containing page locations - let maybe_pages = indexes::read_page_locations(&mut reader, column_metadata)?; + let maybe_pages = read::read_page_locations(&mut reader, column_metadata)?; if let Some(pages) = maybe_pages { // there are page locations in the file println!("{pages:?}"); diff --git a/src/indexes/mod.rs b/src/indexes/mod.rs index 6ac51ce97..64648c3e7 100644 --- a/src/indexes/mod.rs +++ b/src/indexes/mod.rs @@ -1,15 +1,13 @@ mod index; mod intervals; -mod read; -mod write; +pub(crate) mod read; +pub(crate) mod write; pub use crate::parquet_bridge::BoundaryOrder; pub use parquet_format_async_temp::PageLocation; pub use self::index::{ByteIndex, FixedLenByteIndex, Index, NativeIndex, PageIndex}; pub use intervals::{compute_rows, select_pages, FilteredPage, Interval}; -pub use read::*; -pub(crate) use write::*; #[cfg(test)] mod tests { diff --git a/src/indexes/read.rs b/src/indexes/read.rs index 544544e77..83fcb08a7 100644 --- a/src/indexes/read.rs +++ b/src/indexes/read.rs @@ -12,7 +12,7 @@ use crate::schema::types::PhysicalType; use super::{ByteIndex, FixedLenByteIndex, Index, NativeIndex}; /// Read the [`ColumnIndex`] from the [`ColumnChunk`], if available. 
-pub fn read_column<R: Read + Seek>(
+pub fn read_column_index<R: Read + Seek>(
     reader: &mut R,
     chunk: &ColumnChunkMetaData,
 ) -> Result<Option<Box<dyn Index>>, ParquetError> {
diff --git a/src/read/mod.rs b/src/read/mod.rs
index 50f4c592e..f8af9dab1 100644
--- a/src/read/mod.rs
+++ b/src/read/mod.rs
@@ -24,6 +24,8 @@ use crate::page::CompressedDataPage;
 use crate::schema::types::ParquetType;
 use crate::{error::Result, metadata::FileMetaData};

+pub use crate::indexes::read::{read_column_index, read_page_locations};
+
 /// Filters row group metadata to only those row groups,
 /// for which the predicate function returns true
 pub fn filter_row_groups(
diff --git a/src/write/file.rs b/src/write/file.rs
index 6f20acfd0..eb34a2d04 100644
--- a/src/write/file.rs
+++ b/src/write/file.rs
@@ -6,7 +6,7 @@ use parquet_format_async_temp::thrift::protocol::TCompactOutputProtocol;
 use parquet_format_async_temp::thrift::protocol::TOutputProtocol;
 use parquet_format_async_temp::RowGroup;

-use crate::indexes::{write_column_index, write_offset_index};
+use crate::indexes::write::{write_column_index, write_offset_index};
 use crate::{
     error::{ParquetError, Result},
     metadata::SchemaDescriptor,
diff --git a/tests/it/write/mod.rs b/tests/it/write/mod.rs
index e58f6a94c..b5f7b21da 100644
--- a/tests/it/write/mod.rs
+++ b/tests/it/write/mod.rs
@@ -5,9 +5,9 @@ use std::sync::Arc;

 use parquet2::compression::Compression;
 use parquet2::error::Result;
-use parquet2::indexes::{self, BoundaryOrder, Index, NativeIndex, PageIndex, PageLocation};
+use parquet2::indexes::{BoundaryOrder, Index, NativeIndex, PageIndex, PageLocation};
 use parquet2::metadata::SchemaDescriptor;
-use parquet2::read::read_metadata;
+use parquet2::read::{read_column_index, read_metadata, read_page_locations};
 use parquet2::statistics::Statistics;
 use parquet2::write::{Compressor, DynIter, DynStreamingIterator, FileWriter, Version};
 use parquet2::{metadata::Descriptor, page::EncodedPage, write::WriteOptions};
@@ -252,10 +252,10 @@ fn indexes() -> Result<()> {

     let column_metadata = &metadata.row_groups[0].columns()[0];

-    let index = indexes::read_column(&mut reader, column_metadata)?.expect("column index");
+    let index = read_column_index(&mut reader, column_metadata)?.expect("column index");
     assert_eq!(&index, &expected_index);

-    let pages = indexes::read_page_locations(&mut reader, column_metadata)?.expect("offset index");
+    let pages = read_page_locations(&mut reader, column_metadata)?.expect("offset index");
     assert_eq!(pages, expected_page_locations);

     Ok(())
From 065c65b140039c5501db515211184c9ac937a14b Mon Sep 17 00:00:00 2001
From: "Jorge C.
Leitao" Date: Sun, 20 Mar 2022 07:18:26 +0000 Subject: [PATCH 07/16] Clean --- tests/it/write/mod.rs | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/it/write/mod.rs b/tests/it/write/mod.rs index b5f7b21da..0824adec1 100644 --- a/tests/it/write/mod.rs +++ b/tests/it/write/mod.rs @@ -203,6 +203,27 @@ fn indexes() -> Result<()> { array_to_page_v1::(&array2, &options, &schema.columns()[0].descriptor), ]; + let pages = DynStreamingIterator::new(Compressor::new( + DynIter::new(pages.into_iter()), + options.compression, + vec![], + )); + let columns = std::iter::once(Ok(pages)); + + let writer = Cursor::new(vec![]); + let mut writer = FileWriter::new(writer, schema, options, None); + + writer.start()?; + writer.write(DynIter::new(columns), 7 + 2)?; + let writer = writer.end(None)?.1; + + let data = writer.into_inner(); + let mut reader = Cursor::new(data); + + let metadata = read_metadata(&mut reader)?; + + let column_metadata = &metadata.row_groups[0].columns()[0]; + let expected_page_locations = vec![ PageLocation { offset: 4, @@ -231,27 +252,6 @@ fn indexes() -> Result<()> { boundary_order: BoundaryOrder::Unordered, }) as Box; - let pages = DynStreamingIterator::new(Compressor::new( - DynIter::new(pages.into_iter()), - options.compression, - vec![], - )); - let columns = std::iter::once(Ok(pages)); - - let writer = Cursor::new(vec![]); - let mut writer = FileWriter::new(writer, schema, options, None); - - writer.start()?; - writer.write(DynIter::new(columns), 7 + 2)?; - let writer = writer.end(None)?.1; - - let data = writer.into_inner(); - let mut reader = Cursor::new(data); - - let metadata = read_metadata(&mut reader)?; - - let column_metadata = &metadata.row_groups[0].columns()[0]; - let index = read_column_index(&mut reader, column_metadata)?.expect("column index"); assert_eq!(&index, &expected_index); From 6791aaccee384775aae935b32b9158b2f51ac078 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 20 Mar 2022 18:45:55 +0000 Subject: [PATCH 08/16] Improved API --- src/indexes/write.rs | 7 +------ src/write/file.rs | 3 +-- src/write/page.rs | 4 ++++ src/write/row_group.rs | 24 +++++++++++++++++++++--- tests/it/write/mod.rs | 7 +++---- 5 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/indexes/write.rs b/src/indexes/write.rs index 60307280e..26974e5a0 100644 --- a/src/indexes/write.rs +++ b/src/indexes/write.rs @@ -6,17 +6,12 @@ use parquet_format_async_temp::ColumnIndex; use parquet_format_async_temp::thrift::protocol::TCompactOutputProtocol; use parquet_format_async_temp::OffsetIndex; use parquet_format_async_temp::PageLocation; -use parquet_format_async_temp::PageType; use crate::error::{ParquetError, Result}; pub use crate::metadata::KeyValue; use crate::statistics::serialize_statistics; -use crate::write::page::PageWriteSpec; - -fn is_data_page(page: &PageWriteSpec) -> bool { - page.header.type_ == PageType::DATA_PAGE || page.header.type_ == PageType::DATA_PAGE_V2 -} +use crate::write::page::{is_data_page, PageWriteSpec}; pub fn write_column_index(writer: &mut W, pages: &[PageWriteSpec]) -> Result { let mut null_pages = Vec::with_capacity(pages.len()); diff --git a/src/write/file.rs b/src/write/file.rs index eb34a2d04..2fabd0099 100644 --- a/src/write/file.rs +++ b/src/write/file.rs @@ -96,7 +96,7 @@ impl FileWriter { /// Writes a row group to the file. 
/// /// This call is IO-bounded - pub fn write(&mut self, row_group: RowGroupIter<'_, E>, num_rows: usize) -> Result<()> + pub fn write(&mut self, row_group: RowGroupIter<'_, E>) -> Result<()> where ParquetError: From, E: std::error::Error, @@ -113,7 +113,6 @@ impl FileWriter { self.schema.columns(), self.options.compression, row_group, - num_rows, ordinal, )?; self.offset += size; diff --git a/src/write/page.rs b/src/write/page.rs index 7c2c97662..30fa75f44 100644 --- a/src/write/page.rs +++ b/src/write/page.rs @@ -14,6 +14,10 @@ use crate::page::{ }; use crate::statistics::Statistics; +pub(crate) fn is_data_page(page: &PageWriteSpec) -> bool { + page.header.type_ == PageType::DATA_PAGE || page.header.type_ == PageType::DATA_PAGE_V2 +} + fn maybe_bytes(uncompressed: usize, compressed: usize) -> Result<(i32, i32)> { let uncompressed_page_size: i32 = uncompressed.try_into().map_err(|_| { ParquetError::OutOfSpec(format!( diff --git a/src/write/row_group.rs b/src/write/row_group.rs index 1b61f8d84..604fecd73 100644 --- a/src/write/row_group.rs +++ b/src/write/row_group.rs @@ -12,7 +12,7 @@ use crate::{ use super::{ column_chunk::{write_column_chunk, write_column_chunk_async}, - page::PageWriteSpec, + page::{is_data_page, PageWriteSpec}, DynIter, DynStreamingIterator, }; @@ -62,7 +62,6 @@ pub fn write_row_group< descriptors: &[ColumnDescriptor], compression: Compression, columns: DynIter<'a, std::result::Result, E>>, - num_rows: usize, ordinal: usize, ) -> Result<(RowGroup, Vec>, u64)> where @@ -83,6 +82,25 @@ where .collect::>>()?; let bytes_written = offset - initial; + let num_rows = columns + .get(0) + .map(|(_, specs)| { + let mut num_rows = 0; + specs + .iter() + .filter(|x| is_data_page(x)) + .try_for_each(|spec| { + num_rows += spec.num_rows.ok_or_else(|| { + ParquetError::OutOfSpec( + "All data pages must declare a number of rows on it".to_string(), + ) + })? 
as i64;
+                    Result::Ok(())
+                })?;
+            Result::Ok(num_rows)
+        })
+        .unwrap_or(Ok(0))?;
+
     // compute row group stats
     let file_offset = columns
         .get(0)
@@ -106,7 +124,7 @@ where
         RowGroup {
             columns,
             total_byte_size,
-            num_rows: num_rows as i64,
+            num_rows,
             sorting_columns: None,
             file_offset,
             total_compressed_size: Some(total_compressed_size),
diff --git a/tests/it/write/mod.rs b/tests/it/write/mod.rs
index 0824adec1..d1ffda9a9 100644
--- a/tests/it/write/mod.rs
+++ b/tests/it/write/mod.rs
@@ -60,7 +60,6 @@ fn test_column(column: usize) -> Result<()> {

     let a = schema.columns();

-    let num_rows = array.len();
     let pages = DynStreamingIterator::new(Compressor::new_from_vec(
         DynIter::new(std::iter::once(array_to_page(
             &array,
@@ -76,7 +75,7 @@ fn test_column(column: usize) -> Result<()> {
     let mut writer = FileWriter::new(writer, schema, options, None);

     writer.start()?;
-    writer.write(DynIter::new(columns), num_rows)?;
+    writer.write(DynIter::new(columns))?;
     let writer = writer.end(None)?.1;

     let data = writer.into_inner();
@@ -167,7 +166,7 @@ fn basic() -> Result<()> {
     let mut writer = FileWriter::new(writer, schema, options, None);

     writer.start()?;
-    writer.write(DynIter::new(columns), 7)?;
+    writer.write(DynIter::new(columns))?;
     let writer = writer.end(None)?.1;

     let data = writer.into_inner();
@@ -214,7 +213,7 @@ fn indexes() -> Result<()> {
     let mut writer = FileWriter::new(writer, schema, options, None);

     writer.start()?;
-    writer.write(DynIter::new(columns), 7 + 2)?;
+    writer.write(DynIter::new(columns))?;
     let writer = writer.end(None)?.1;

     let data = writer.into_inner();
From 4e4f8db52763c0f9d1bb934d566cb81ec9e0ae8c Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao"
Date: Sun, 20 Mar 2022 19:25:08 +0000
Subject: [PATCH 09/16] Fixed error

---
 src/statistics/fixed_len_binary.rs |  4 +-
 src/write/column_chunk.rs          |  8 ++--
 src/write/row_group.rs             | 63 ++++++++++++++++++------------
 src/write/stream.rs                |  5 +--
 4 files changed, 45 insertions(+), 35 deletions(-)

diff --git a/src/statistics/fixed_len_binary.rs b/src/statistics/fixed_len_binary.rs
index 368d6e7b3..4861b854b 100644
--- a/src/statistics/fixed_len_binary.rs
+++ b/src/statistics/fixed_len_binary.rs
@@ -15,7 +15,6 @@ pub struct FixedLenStatistics {
     pub distinct_count: Option<i64>,
     pub max_value: Option<Vec<u8>>,
     pub min_value: Option<Vec<u8>>,
-    pub(self) physical_type: PhysicalType,
 }

 impl Statistics for FixedLenStatistics {
@@ -24,7 +23,7 @@ impl Statistics for FixedLenStatistics {
     }

     fn physical_type(&self) -> &PhysicalType {
-        &self.physical_type
+        &self.primitive_type.physical_type
     }

     fn null_count(&self) -> Option<i64> {
@@ -64,7 +63,6 @@ pub fn read(
                 x.truncate(size as usize);
                 x
             }),
-            physical_type: PhysicalType::FixedLenByteArray(size),
         }))
 }

diff --git a/src/write/column_chunk.rs b/src/write/column_chunk.rs
index 71aa5628d..7c0a3e1b0 100644
--- a/src/write/column_chunk.rs
+++ b/src/write/column_chunk.rs
@@ -67,7 +67,7 @@ pub async fn write_column_chunk_async<W, E>(
     descriptor: &ColumnDescriptor,
     compression: Compression,
     mut compressed_pages: DynStreamingIterator<'_, CompressedPage, E>,
-) -> Result<(ColumnChunk, usize)>
+) -> Result<(ColumnChunk, Vec<PageWriteSpec>, u64)>
 where
     W: AsyncWrite + Unpin + Send,
     ParquetError: From<E>,
@@ -81,7 +81,7 @@ where
         offset += spec.bytes_written;
         specs.push(spec);
     }
-    let mut bytes_written = (offset - initial) as usize;
+    let mut bytes_written = offset - initial;

     let column_chunk = build_column_chunk(&specs, descriptor, compression)?;

@@ -92,10 +92,10 @@ where
         .as_ref()
         .unwrap()
         .write_to_out_stream_protocol(&mut protocol)
        .await? as u64;
     protocol.flush().await?;

-    Ok((column_chunk, bytes_written))
+    Ok((column_chunk, specs, bytes_written))
 }

 fn build_column_chunk(
diff --git a/src/write/row_group.rs b/src/write/row_group.rs
index 604fecd73..50f5221a7 100644
--- a/src/write/row_group.rs
+++ b/src/write/row_group.rs
@@ -52,6 +52,27 @@ impl ColumnOffsetsMetadata {
     }
 }

+fn compute_num_rows(columns: &[(ColumnChunk, Vec<PageWriteSpec>)]) -> Result<i64> {
+    columns
+        .get(0)
+        .map(|(_, specs)| {
+            let mut num_rows = 0;
+            specs
+                .iter()
+                .filter(|x| is_data_page(x))
+                .try_for_each(|spec| {
+                    num_rows += spec.num_rows.ok_or_else(|| {
+                        ParquetError::OutOfSpec(
+                            "All data pages must declare the number of rows they contain".to_string(),
+                        )
+                    })? as i64;
+                    Result::Ok(())
+                })?;
+            Result::Ok(num_rows)
+        })
+        .unwrap_or(Ok(0))
+}
+
 pub fn write_row_group<
     'a,
     W,
@@ -82,24 +103,7 @@ where
         .collect::<Result<Vec<_>>>()?;
     let bytes_written = offset - initial;

-    let num_rows = columns
-        .get(0)
-        .map(|(_, specs)| {
-            let mut num_rows = 0;
-            specs
-                .iter()
-                .filter(|x| is_data_page(x))
-                .try_for_each(|spec| {
-                    num_rows += spec.num_rows.ok_or_else(|| {
-                        ParquetError::OutOfSpec(
-                            "All data pages must declare a number of rows on it".to_string(),
-                        )
-                    })? as i64;
-                    Result::Ok(())
-                })?;
-            Result::Ok(num_rows)
-        })
-        .unwrap_or(Ok(0))?;
+    let num_rows = compute_num_rows(&columns)?;

     // compute row group stats
     let file_offset = columns
@@ -145,8 +149,7 @@ pub async fn write_row_group_async<
     descriptors: &[ColumnDescriptor],
     compression: Compression,
     columns: DynIter<'a, std::result::Result<DynStreamingIterator<'a, CompressedPage, E>, E>>,
-    num_rows: usize,
-) -> Result<(RowGroup, u64)>
+) -> Result<(RowGroup, Vec<Vec<PageWriteSpec>>, u64)>
 where
     W: AsyncWrite + Unpin + Send,
     ParquetError: From<E>,
@@ -157,25 +160,34 @@ where
     let initial = offset;
     let mut columns = vec![];
     for (descriptor, page_iter) in column_iter {
-        let (spec, size) =
+        let (column, page_specs, size) =
             write_column_chunk_async(writer, offset, descriptor, compression, page_iter?).await?;
+        offset += size;
         offset += size as u64;
-        columns.push(spec);
+        columns.push((column, page_specs));
     }
     let bytes_written = offset - initial;

+    let num_rows = compute_num_rows(&columns)?;
+
     // compute row group stats
     let file_offset = columns
         .get(0)
-        .map(|column_chunk| {
+        .map(|(column_chunk, _)| {
             ColumnOffsetsMetadata::from_column_chunk(column_chunk).calc_row_group_file_offset()
         })
         .unwrap_or(None);
     let total_byte_size = columns
         .iter()
-        .map(|c| c.meta_data.as_ref().unwrap().total_compressed_size)
+        .map(|(c, _)| c.meta_data.as_ref().unwrap().total_uncompressed_size)
         .sum();
+    let total_compressed_size = columns
+        .iter()
+        .map(|(c, _)| c.meta_data.as_ref().unwrap().total_compressed_size)
+        .sum();
+
+    let (columns, specs) = columns.into_iter().unzip();

     Ok((
         RowGroup {
             columns,
             total_byte_size,
             num_rows: num_rows as i64,
             sorting_columns: None,
             file_offset,
-            total_compressed_size: None,
+            total_compressed_size: Some(total_compressed_size),
             ordinal: None,
         },
+        specs,
         bytes_written,
     ))
 }
diff --git a/src/write/stream.rs b/src/write/stream.rs
index a766a02ea..752950437 100644
--- a/src/write/stream.rs
+++ b/src/write/stream.rs
@@ -92,7 +92,7 @@ impl<W: AsyncWrite + Unpin + Send> FileStreamer<W> {
     }

     /// Writes a row group to the file.
-    pub async fn write<E>(&mut self, row_group: RowGroupIter<'_, E>, num_rows: usize) -> Result<()>
+    pub async fn write<E>(&mut self, row_group: RowGroupIter<'_, E>) -> Result<()>
     where
         ParquetError: From<E>,
         E: std::error::Error,
     {
         if self.offset == 0 {
             return Err(ParquetError::General(
                 "You must call `start` before writing the first row group".to_string(),
             ));
         }
-        let (group, size) = write_row_group_async(
+        let (group, _specs, size) = write_row_group_async(
             &mut self.writer,
             self.offset,
             self.schema.columns(),
             self.options.compression,
             row_group,
-            num_rows,
         )
         .await?;
         self.offset += size;
From 39fce3c61f99136f2fafc3f0f3127931cc0245d0 Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao"
Date: Sun, 20 Mar 2022 22:01:08 +0000
Subject: [PATCH 10/16] Simpler API

---
 src/indexes/deserialize.rs | 35 ++++++++++++++++
 src/indexes/index.rs       | 71 ++++++++++++++++++------------
 src/indexes/mod.rs         |  6 +++
 src/indexes/read.rs        | 28 +++----------
 src/indexes/serialize.rs   | 83 +++++++++++++++++++++++++++++++++++++
 src/indexes/write.rs       | 84 ++++----------------------------------
 src/write/row_group.rs     |  1 -
 tests/it/write/mod.rs      |  2 +
 8 files changed, 184 insertions(+), 126 deletions(-)
 create mode 100644 src/indexes/deserialize.rs
 create mode 100644 src/indexes/serialize.rs

diff --git a/src/indexes/deserialize.rs b/src/indexes/deserialize.rs
new file mode 100644
index 000000000..e4bce70d4
--- /dev/null
+++ b/src/indexes/deserialize.rs
@@ -0,0 +1,35 @@
+use std::io::Cursor;
+
+use parquet_format_async_temp::{thrift::protocol::TCompactInputProtocol, ColumnIndex};
+
+use crate::error::ParquetError;
+use crate::schema::types::{PhysicalType, PrimitiveType};
+
+use super::{ByteIndex, FixedLenByteIndex, Index, NativeIndex};
+
+pub fn deserialize(
+    data: &[u8],
+    primitive_type: PrimitiveType,
+) -> Result<Option<Box<dyn Index>>, ParquetError> {
+    let mut d = Cursor::new(data);
+    let mut prot = TCompactInputProtocol::new(&mut d);
+
+    let index = ColumnIndex::read_from_in_protocol(&mut prot)?;
+
+    let index = match primitive_type.physical_type {
+        PhysicalType::Boolean => return Ok(None),
+        PhysicalType::Int32 => {
+            Box::new(NativeIndex::<i32>::try_new(index, primitive_type)?) as Box<dyn Index>
+        }
+        PhysicalType::Int64 => Box::new(NativeIndex::<i64>::try_new(index, primitive_type)?),
+        PhysicalType::Int96 => Box::new(NativeIndex::<[u32; 3]>::try_new(index, primitive_type)?),
+        PhysicalType::Float => Box::new(NativeIndex::<f32>::try_new(index, primitive_type)?),
+        PhysicalType::Double => Box::new(NativeIndex::<f64>::try_new(index, primitive_type)?),
+        PhysicalType::ByteArray => Box::new(ByteIndex::try_new(index, primitive_type)?),
+        PhysicalType::FixedLenByteArray(_) => {
+            Box::new(FixedLenByteIndex::try_new(index, primitive_type)?)
+        }
+    };
+
+    Ok(Some(index))
+}
diff --git a/src/indexes/index.rs b/src/indexes/index.rs
index 1f6c1cff0..7f117ad8b 100644
--- a/src/indexes/index.rs
+++ b/src/indexes/index.rs
@@ -3,8 +3,12 @@ use std::any::Any;

 use parquet_format_async_temp::ColumnIndex;

 use crate::parquet_bridge::BoundaryOrder;
+use crate::schema::types::PrimitiveType;
 use crate::{error::ParquetError, schema::types::PhysicalType, types::NativeType};

+/// Trait object representing a [`ColumnIndex`] in Rust's native format.
+///
+/// See [`NativeIndex`], [`ByteIndex`] and [`FixedLenByteIndex`] for concrete implementations.
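+///
+/// A hedged sketch of typical use (it assumes an `index: &dyn Index` obtained from a
+/// reader; `NativeIndex<i32>` is only one possible concrete type):
+///
+/// ```ignore
+/// use parquet2::indexes::{Index, NativeIndex};
+///
+/// fn first_page_min(index: &dyn Index) -> Option<i32> {
+///     let native = index.as_any().downcast_ref::<NativeIndex<i32>>()?;
+///     native.indexes.first().and_then(|page| page.min)
+/// }
+/// ```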
 pub trait Index: Send + Sync + std::fmt::Debug {
     fn as_any(&self) -> &dyn Any;

@@ -65,25 +69,20 @@ fn equal(lhs: &dyn Index, rhs: &dyn Index) -> bool {
 /// An index of a column of [`NativeType`] physical representation
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct NativeIndex<T: NativeType> {
+    /// The primitive type
+    pub primitive_type: PrimitiveType,
+    /// The indexes, one item per page
     pub indexes: Vec<PageIndex<T>>,
+    /// The order
     pub boundary_order: BoundaryOrder,
 }

-/// The index of a page, containing the min and max values of the page.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct PageIndex<T> {
-    /// The minimum value in the page. It is None when all values are null
-    pub min: Option<T>,
-    /// The maximum value in the page. It is None when all values are null
-    pub max: Option<T>,
-    /// The number of null values in the page
-    pub null_count: Option<i64>,
-}
-
-impl<T: NativeType> TryFrom<ColumnIndex> for NativeIndex<T> {
-    type Error = ParquetError;
-
-    fn try_from(index: ColumnIndex) -> Result<Self, Self::Error> {
+impl<T: NativeType> NativeIndex<T> {
+    /// Creates a new [`NativeIndex`]
+    pub(crate) fn try_new(
+        index: ColumnIndex,
+        primitive_type: PrimitiveType,
+    ) -> Result<Self, ParquetError> {
         let len = index.min_values.len();

         let null_counts = index
@@ -114,12 +113,24 @@ impl<T: NativeType> TryFrom<ColumnIndex> for NativeIndex<T> {
             .collect::<Result<Vec<_>, ParquetError>>()?;

         Ok(Self {
+            primitive_type,
             indexes,
             boundary_order: index.boundary_order.try_into()?,
         })
     }
 }

+/// The index of a page, containing the min and max values of the page.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct PageIndex<T> {
+    /// The minimum value in the page. It is None when all values are null
+    pub min: Option<T>,
+    /// The maximum value in the page. It is None when all values are null
+    pub max: Option<T>,
+    /// The number of null values in the page
+    pub null_count: Option<i64>,
+}
+
 impl<T: NativeType> Index for NativeIndex<T> {
     fn as_any(&self) -> &dyn Any {
         self
@@ -133,14 +144,18 @@ impl<T: NativeType> Index for NativeIndex<T> {
 /// An index of a column of bytes physical type
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct ByteIndex {
+    /// The [`PrimitiveType`].
+    pub primitive_type: PrimitiveType,
+    /// The indexes, one item per page
     pub indexes: Vec<PageIndex<Vec<u8>>>,
     pub boundary_order: BoundaryOrder,
 }

-impl TryFrom<ColumnIndex> for ByteIndex {
-    type Error = ParquetError;
-
-    fn try_from(index: ColumnIndex) -> Result<Self, Self::Error> {
+impl ByteIndex {
+    pub(crate) fn try_new(
+        index: ColumnIndex,
+        primitive_type: PrimitiveType,
+    ) -> Result<Self, ParquetError> {
         let len = index.min_values.len();

         let null_counts = index
@@ -169,6 +184,7 @@ impl TryFrom<ColumnIndex> for ByteIndex {
             .collect::<Result<Vec<_>, ParquetError>>()?;

         Ok(Self {
+            primitive_type,
             indexes,
             boundary_order: index.boundary_order.try_into()?,
         })
@@ -188,15 +204,18 @@ impl Index for ByteIndex {
 /// An index of a column of fixed len byte physical type
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct FixedLenByteIndex {
-    pub type_: PhysicalType,
+    /// The [`PrimitiveType`].
+ pub primitive_type: PrimitiveType, + /// The indexes, one item per page pub indexes: Vec>>, pub boundary_order: BoundaryOrder, } -impl TryFrom<(ColumnIndex, i32)> for FixedLenByteIndex { - type Error = ParquetError; - - fn try_from((index, size): (ColumnIndex, i32)) -> Result { +impl FixedLenByteIndex { + pub(crate) fn try_new( + index: ColumnIndex, + primitive_type: PrimitiveType, + ) -> Result { let len = index.min_values.len(); let null_counts = index @@ -225,7 +244,7 @@ impl TryFrom<(ColumnIndex, i32)> for FixedLenByteIndex { .collect::, ParquetError>>()?; Ok(Self { - type_: PhysicalType::FixedLenByteArray(size), + primitive_type, indexes, boundary_order: index.boundary_order.try_into()?, }) @@ -238,6 +257,6 @@ impl Index for FixedLenByteIndex { } fn physical_type(&self) -> &PhysicalType { - &self.type_ + &self.primitive_type.physical_type } } diff --git a/src/indexes/mod.rs b/src/indexes/mod.rs index 64648c3e7..c4f46e0bf 100644 --- a/src/indexes/mod.rs +++ b/src/indexes/mod.rs @@ -1,6 +1,8 @@ +mod deserialize; mod index; mod intervals; pub(crate) mod read; +mod serialize; pub(crate) mod write; pub use crate::parquet_bridge::BoundaryOrder; @@ -13,9 +15,12 @@ pub use intervals::{compute_rows, select_pages, FilteredPage, Interval}; mod tests { use super::*; + use crate::schema::types::{PhysicalType, PrimitiveType}; + #[test] fn test_basic() { let index = NativeIndex { + primitive_type: PrimitiveType::from_physical("c1".to_string(), PhysicalType::Int32), indexes: vec![PageIndex { min: Some(0i32), max: Some(10), @@ -40,6 +45,7 @@ mod tests { fn test_multiple() { // two pages let index = ByteIndex { + primitive_type: PrimitiveType::from_physical("c1".to_string(), PhysicalType::ByteArray), indexes: vec![ PageIndex { min: Some(vec![0]), diff --git a/src/indexes/read.rs b/src/indexes/read.rs index 83fcb08a7..59f51a7ab 100644 --- a/src/indexes/read.rs +++ b/src/indexes/read.rs @@ -2,16 +2,16 @@ use std::convert::TryInto; use std::io::{Cursor, Read, Seek, SeekFrom}; use parquet_format_async_temp::{ - thrift::protocol::TCompactInputProtocol, ColumnIndex, OffsetIndex, PageLocation, + thrift::protocol::TCompactInputProtocol, OffsetIndex, PageLocation, }; use crate::error::ParquetError; use crate::metadata::ColumnChunkMetaData; -use crate::schema::types::PhysicalType; -use super::{ByteIndex, FixedLenByteIndex, Index, NativeIndex}; +use super::deserialize::deserialize; +use super::Index; -/// Read the [`ColumnIndex`] from the [`ColumnChunk`], if available. +/// Read the column index from the [`ColumnChunkMetaData`] if available and deserializes it into [`Index`]. pub fn read_column_index( reader: &mut R, chunk: &ColumnChunkMetaData, @@ -32,24 +32,8 @@ pub fn read_column_index( let mut data = vec![0; length]; reader.read_exact(&mut data)?; - let mut d = Cursor::new(&data); - let mut prot = TCompactInputProtocol::new(&mut d); - - let index = ColumnIndex::read_from_in_protocol(&mut prot)?; - let index = match chunk.descriptor().descriptor.primitive_type.physical_type { - PhysicalType::Boolean => return Ok(None), - PhysicalType::Int32 => Box::new(NativeIndex::::try_from(index)?) as Box, - PhysicalType::Int64 => Box::new(NativeIndex::::try_from(index)?) as _, - PhysicalType::Int96 => Box::new(NativeIndex::<[u32; 3]>::try_from(index)?) 
as _, - PhysicalType::Float => Box::new(NativeIndex::::try_from(index)?), - PhysicalType::Double => Box::new(NativeIndex::::try_from(index)?), - PhysicalType::ByteArray => Box::new(ByteIndex::try_from(index)?), - PhysicalType::FixedLenByteArray(size) => { - Box::new(FixedLenByteIndex::try_from((index, size))?) - } - }; - - Ok(Some(index)) + let primitive_type = chunk.descriptor().descriptor.primitive_type.clone(); + deserialize(&data, primitive_type) } /// Read [`PageLocation`]s from the [`ColumnChunk`], if available. diff --git a/src/indexes/serialize.rs b/src/indexes/serialize.rs new file mode 100644 index 000000000..2f494613f --- /dev/null +++ b/src/indexes/serialize.rs @@ -0,0 +1,83 @@ +use parquet_format_async_temp::BoundaryOrder; +use parquet_format_async_temp::ColumnIndex; +use parquet_format_async_temp::OffsetIndex; +use parquet_format_async_temp::PageLocation; + +use crate::error::{ParquetError, Result}; +pub use crate::metadata::KeyValue; +use crate::statistics::serialize_statistics; + +use crate::write::page::{is_data_page, PageWriteSpec}; + +pub fn serialize_column_index(pages: &[PageWriteSpec]) -> Result { + let mut null_pages = Vec::with_capacity(pages.len()); + let mut min_values = Vec::with_capacity(pages.len()); + let mut max_values = Vec::with_capacity(pages.len()); + let mut null_counts = Vec::with_capacity(pages.len()); + + pages + .iter() + .filter(|x| is_data_page(x)) + .try_for_each(|spec| { + if let Some(stats) = &spec.statistics { + let stats = serialize_statistics(stats.as_ref()); + + let null_count = stats.null_count.ok_or_else(|| { + ParquetError::OutOfSpec("null count of a page is required".to_string()) + })?; + + null_counts.push(null_count); + if null_count as usize == spec.num_values { + min_values.push(vec![0]); + max_values.push(vec![0]); + null_pages.push(true) + } else { + min_values.push(stats.min_value.ok_or_else(|| { + ParquetError::OutOfSpec("min value of a page is required".to_string()) + })?); + max_values.push(stats.max_value.ok_or_else(|| { + ParquetError::OutOfSpec("max value of a page is required".to_string()) + })?); + null_pages.push(false) + }; + + Result::Ok(()) + } else { + Err(ParquetError::OutOfSpec( + "options were set to write statistics but some pages miss them".to_string(), + )) + } + })?; + Ok(ColumnIndex { + null_pages, + min_values, + max_values, + boundary_order: BoundaryOrder::UNORDERED, + null_counts: Some(null_counts), + }) +} + +pub fn serialize_offset_index(pages: &[PageWriteSpec]) -> Result { + let mut first_row_index = 0; + let page_locations = pages + .iter() + .filter(|x| is_data_page(x)) + .map(|spec| { + let location = PageLocation { + offset: spec.offset.try_into()?, + compressed_page_size: spec.bytes_written.try_into()?, + first_row_index, + }; + let num_rows = spec.num_rows.ok_or_else(|| { + ParquetError::OutOfSpec( + "options were set to write statistics but some data pages miss number of rows" + .to_string(), + ) + })?; + first_row_index += num_rows as i64; + Ok(location) + }) + .collect::>>()?; + + Ok(OffsetIndex { page_locations }) +} diff --git a/src/indexes/write.rs b/src/indexes/write.rs index 26974e5a0..240309761 100644 --- a/src/indexes/write.rs +++ b/src/indexes/write.rs @@ -1,92 +1,22 @@ use std::io::Write; -use parquet_format_async_temp::BoundaryOrder; -use parquet_format_async_temp::ColumnIndex; - use parquet_format_async_temp::thrift::protocol::TCompactOutputProtocol; -use parquet_format_async_temp::OffsetIndex; -use parquet_format_async_temp::PageLocation; -use crate::error::{ParquetError, Result}; 
+use crate::error::Result; pub use crate::metadata::KeyValue; -use crate::statistics::serialize_statistics; - -use crate::write::page::{is_data_page, PageWriteSpec}; - -pub fn write_column_index(writer: &mut W, pages: &[PageWriteSpec]) -> Result { - let mut null_pages = Vec::with_capacity(pages.len()); - let mut min_values = Vec::with_capacity(pages.len()); - let mut max_values = Vec::with_capacity(pages.len()); - let mut null_counts = Vec::with_capacity(pages.len()); - - pages - .iter() - .filter(|x| is_data_page(x)) - .try_for_each(|spec| { - if let Some(stats) = &spec.statistics { - let stats = serialize_statistics(stats.as_ref()); - let null_count = stats.null_count.ok_or_else(|| { - ParquetError::OutOfSpec("null count of a page is required".to_string()) - })?; +use crate::write::page::PageWriteSpec; - null_counts.push(null_count); - if null_count as usize == spec.num_values { - min_values.push(vec![0]); - max_values.push(vec![0]); - null_pages.push(true) - } else { - min_values.push(stats.min_value.ok_or_else(|| { - ParquetError::OutOfSpec("min value of a page is required".to_string()) - })?); - max_values.push(stats.max_value.ok_or_else(|| { - ParquetError::OutOfSpec("max value of a page is required".to_string()) - })?); - null_pages.push(false) - }; +use super::serialize::{serialize_column_index, serialize_offset_index}; - Result::Ok(()) - } else { - Err(ParquetError::OutOfSpec( - "options were set to write statistics but some pages miss them".to_string(), - )) - } - })?; - let index = ColumnIndex { - null_pages, - min_values, - max_values, - boundary_order: BoundaryOrder::UNORDERED, - null_counts: Some(null_counts), - }; +pub fn write_column_index(writer: &mut W, pages: &[PageWriteSpec]) -> Result { + let index = serialize_column_index(pages)?; let mut protocol = TCompactOutputProtocol::new(writer); Ok(index.write_to_out_protocol(&mut protocol)? as u64) } pub fn write_offset_index(writer: &mut W, pages: &[PageWriteSpec]) -> Result { - let mut first_row_index = 0; - let page_locations = pages - .iter() - .filter(|x| is_data_page(x)) - .map(|spec| { - let location = PageLocation { - offset: spec.offset.try_into()?, - compressed_page_size: spec.bytes_written.try_into()?, - first_row_index, - }; - let num_rows = spec.num_rows.ok_or_else(|| { - ParquetError::OutOfSpec( - "options were set to write statistics but some data pages miss number of rows" - .to_string(), - ) - })?; - first_row_index += num_rows as i64; - Ok(location) - }) - .collect::>>()?; - - let offset_index = OffsetIndex { page_locations }; - + let index = serialize_offset_index(pages)?; let mut protocol = TCompactOutputProtocol::new(&mut *writer); - Ok(offset_index.write_to_out_protocol(&mut protocol)? as u64) + Ok(index.write_to_out_protocol(&mut protocol)? 
as u64) } diff --git a/src/write/row_group.rs b/src/write/row_group.rs index 50f5221a7..9f6585191 100644 --- a/src/write/row_group.rs +++ b/src/write/row_group.rs @@ -163,7 +163,6 @@ where let (column, page_specs, size) = write_column_chunk_async(writer, offset, descriptor, compression, page_iter?).await?; offset += size; - offset += size as u64; columns.push((column, page_specs)); } let bytes_written = offset - initial; diff --git a/tests/it/write/mod.rs b/tests/it/write/mod.rs index d1ffda9a9..435215164 100644 --- a/tests/it/write/mod.rs +++ b/tests/it/write/mod.rs @@ -8,6 +8,7 @@ use parquet2::error::Result; use parquet2::indexes::{BoundaryOrder, Index, NativeIndex, PageIndex, PageLocation}; use parquet2::metadata::SchemaDescriptor; use parquet2::read::{read_column_index, read_metadata, read_page_locations}; +use parquet2::schema::types::{PhysicalType, PrimitiveType}; use parquet2::statistics::Statistics; use parquet2::write::{Compressor, DynIter, DynStreamingIterator, FileWriter, Version}; use parquet2::{metadata::Descriptor, page::EncodedPage, write::WriteOptions}; @@ -236,6 +237,7 @@ fn indexes() -> Result<()> { }, ]; let expected_index = Box::new(NativeIndex:: { + primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Int32), indexes: vec![ PageIndex { min: Some(0), From 1953766b436914fdaeb6e43d74bf35ced2c48cf8 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 21 Mar 2022 16:34:18 +0000 Subject: [PATCH 11/16] Improved reading of indexes --- examples/read_metadata.rs | 26 +++---- src/indexes/read.rs | 148 ++++++++++++++++++++++++++++---------- src/read/mod.rs | 2 +- tests/it/write/mod.rs | 18 ++--- 4 files changed, 133 insertions(+), 61 deletions(-) diff --git a/examples/read_metadata.rs b/examples/read_metadata.rs index 025cfd229..e20e45eca 100644 --- a/examples/read_metadata.rs +++ b/examples/read_metadata.rs @@ -51,25 +51,21 @@ fn main() -> Result<()> { // ANCHOR: column_metadata let row_group = 0; let column = 0; - let column_metadata = metadata.row_groups[row_group].column(column); + let columns = metadata.row_groups[row_group].columns(); + let column_metadata = &columns[column]; // ANCHOR_END: column_metadata // ANCHOR: column_index - // read the column index + // read the column indexes of every column use parquet2::read; - let index = read::read_column_index(&mut reader, column_metadata)?; - if let Some(index) = index { - // these are the minimum and maximum within each page, which can be used - // to skip pages. - println!("{index:?}"); - } - - // read the offset index containing page locations - let maybe_pages = read::read_page_locations(&mut reader, column_metadata)?; - if let Some(pages) = maybe_pages { - // there are page locations in the file - println!("{pages:?}"); - } + let index = read::read_columns_indexes(&mut reader, columns)?; + // these are the minimum and maximum within each page, which can be used + // to skip pages. 
+ println!("{index:?}"); + + // read the offset indexes containing page locations of every column + let pages = read::read_pages_locations(&mut reader, columns)?; + println!("{pages:?}"); // ANCHOR_END: column_index // ANCHOR: statistics diff --git a/src/indexes/read.rs b/src/indexes/read.rs index 59f51a7ab..b4842bd51 100644 --- a/src/indexes/read.rs +++ b/src/indexes/read.rs @@ -1,6 +1,7 @@ use std::convert::TryInto; use std::io::{Cursor, Read, Seek, SeekFrom}; +use parquet_format_async_temp::ColumnChunk; use parquet_format_async_temp::{ thrift::protocol::TCompactInputProtocol, OffsetIndex, PageLocation, }; @@ -11,55 +12,130 @@ use crate::metadata::ColumnChunkMetaData; use super::deserialize::deserialize; use super::Index; -/// Read the column index from the [`ColumnChunkMetaData`] if available and deserializes it into [`Index`]. -pub fn read_column_index( - reader: &mut R, - chunk: &ColumnChunkMetaData, -) -> Result>, ParquetError> { - let metadata = chunk.column_chunk(); - let (offset, length): (u64, usize) = if let Some(offset) = metadata.column_index_offset { - let length = metadata.column_index_length.ok_or_else(|| { - ParquetError::OutOfSpec( - "The column length must exist if column offset exists".to_string(), - ) - })?; - (offset.try_into()?, length.try_into()?) +fn prepare_read Option, G: Fn(&ColumnChunk) -> Option>( + chunks: &[ColumnChunkMetaData], + get_offset: F, + get_length: G, +) -> Result<(u64, Vec), ParquetError> { + // c1: [start, length] + // ... + // cN: [start, length] + + let first_chunk = if let Some(chunk) = chunks.first() { + chunk + } else { + return Ok((0, vec![])); + }; + let metadata = first_chunk.column_chunk(); + + let offset: u64 = if let Some(offset) = get_offset(metadata) { + offset.try_into()? } else { - return Ok(None); + return Ok((0, vec![])); }; + let lengths = chunks + .iter() + .map(|x| get_length(x.column_chunk())) + .map(|maybe_length| { + let index_length = maybe_length.ok_or_else(|| { + ParquetError::OutOfSpec( + "The column length must exist if column offset exists".to_string(), + ) + })?; + + Ok(index_length.try_into()?) + }) + .collect::, ParquetError>>()?; + + Ok((offset, lengths)) +} + +fn prepare_column_index_read( + chunks: &[ColumnChunkMetaData], +) -> Result<(u64, Vec), ParquetError> { + // c1: [start, length] + // ... + // cN: [start, length] + prepare_read(chunks, |x| x.column_index_offset, |x| x.column_index_length) +} + +fn prepare_offset_index_read( + chunks: &[ColumnChunkMetaData], +) -> Result<(u64, Vec), ParquetError> { + // c1: [start, length] + // ... + // cN: [start, length] + prepare_read(chunks, |x| x.offset_index_offset, |x| x.offset_index_length) +} + +fn deserialize_column_indexes( + chunks: &[ColumnChunkMetaData], + data: &[u8], + lengths: Vec, +) -> Result>>, ParquetError> { + let mut start = 0; + let data = lengths.into_iter().map(|length| { + let r = &data[start..length]; + start += length; + r + }); + + chunks + .iter() + .zip(data) + .map(|(chunk, data)| { + let primitive_type = chunk.descriptor().descriptor.primitive_type.clone(); + deserialize(data, primitive_type) + }) + .collect() +} + +/// Reads the column indexes of all [`ColumnChunkMetaData`] and deserializes them into [`Index`]. 
+/// Returns an empty vector if indexes are not available +pub fn read_columns_indexes( + reader: &mut R, + chunks: &[ColumnChunkMetaData], +) -> Result>>, ParquetError> { + let (offset, lengths) = prepare_column_index_read(chunks)?; + + let length = lengths.iter().sum::(); + reader.seek(SeekFrom::Start(offset))?; let mut data = vec![0; length]; reader.read_exact(&mut data)?; - let primitive_type = chunk.descriptor().descriptor.primitive_type.clone(); - deserialize(&data, primitive_type) + deserialize_column_indexes(chunks, &data, lengths) +} + +fn deserialize_page_locations( + data: &[u8], + column_number: usize, +) -> Result>, ParquetError> { + let mut d = Cursor::new(data); + + (0..column_number) + .map(|_| { + let mut prot = TCompactInputProtocol::new(&mut d); + let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; + Ok(offset.page_locations) + }) + .collect() } -/// Read [`PageLocation`]s from the [`ColumnChunk`], if available. -pub fn read_page_locations( +/// Read [`PageLocation`]s from the [`ColumnChunkMetaData`]s. +/// Returns an empty vector if indexes are not available +pub fn read_pages_locations( reader: &mut R, - chunk: &ColumnChunkMetaData, -) -> Result>, ParquetError> { - let (offset, length): (u64, usize) = - if let Some(offset) = chunk.column_chunk().offset_index_offset { - let length = chunk.column_chunk().offset_index_length.ok_or_else(|| { - ParquetError::OutOfSpec( - "The column length must exist if column offset exists".to_string(), - ) - })?; - (offset.try_into()?, length.try_into()?) - } else { - return Ok(None); - }; + chunks: &[ColumnChunkMetaData], +) -> Result>, ParquetError> { + let (offset, lengths) = prepare_offset_index_read(chunks)?; + + let length = lengths.iter().sum::(); reader.seek(SeekFrom::Start(offset))?; let mut data = vec![0; length]; reader.read_exact(&mut data)?; - let mut d = Cursor::new(&data); - let mut prot = TCompactInputProtocol::new(&mut d); - let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; - - Ok(Some(offset.page_locations)) + deserialize_page_locations(&data, chunks.len()) } diff --git a/src/read/mod.rs b/src/read/mod.rs index f8af9dab1..d5216c091 100644 --- a/src/read/mod.rs +++ b/src/read/mod.rs @@ -24,7 +24,7 @@ use crate::page::CompressedDataPage; use crate::schema::types::ParquetType; use crate::{error::Result, metadata::FileMetaData}; -pub use crate::indexes::read::{read_column_index, read_page_locations}; +pub use crate::indexes::read::{read_columns_indexes, read_pages_locations}; /// Filters row group metadata to only those row groups, /// for which the predicate function returns true diff --git a/tests/it/write/mod.rs b/tests/it/write/mod.rs index 435215164..e3223ac0b 100644 --- a/tests/it/write/mod.rs +++ b/tests/it/write/mod.rs @@ -7,7 +7,7 @@ use parquet2::compression::Compression; use parquet2::error::Result; use parquet2::indexes::{BoundaryOrder, Index, NativeIndex, PageIndex, PageLocation}; use parquet2::metadata::SchemaDescriptor; -use parquet2::read::{read_column_index, read_metadata, read_page_locations}; +use parquet2::read::{read_columns_indexes, read_metadata, read_pages_locations}; use parquet2::schema::types::{PhysicalType, PrimitiveType}; use parquet2::statistics::Statistics; use parquet2::write::{Compressor, DynIter, DynStreamingIterator, FileWriter, Version}; @@ -222,9 +222,9 @@ fn indexes() -> Result<()> { let metadata = read_metadata(&mut reader)?; - let column_metadata = &metadata.row_groups[0].columns()[0]; + let columns = &metadata.row_groups[0].columns(); - let expected_page_locations 
= vec![ + let expected_page_locations = vec![vec![ PageLocation { offset: 4, compressed_page_size: 63, @@ -235,8 +235,8 @@ fn indexes() -> Result<()> { compressed_page_size: 47, first_row_index: array1.len() as i64, }, - ]; - let expected_index = Box::new(NativeIndex:: { + ]]; + let expected_index = vec![Some(Box::new(NativeIndex:: { primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Int32), indexes: vec![ PageIndex { @@ -251,12 +251,12 @@ fn indexes() -> Result<()> { }, ], boundary_order: BoundaryOrder::Unordered, - }) as Box; + }) as Box)]; - let index = read_column_index(&mut reader, column_metadata)?.expect("column index"); - assert_eq!(&index, &expected_index); + let indexes = read_columns_indexes(&mut reader, columns)?; + assert_eq!(&indexes, &expected_index); - let pages = read_page_locations(&mut reader, column_metadata)?.expect("offset index"); + let pages = read_pages_locations(&mut reader, columns)?; assert_eq!(pages, expected_page_locations); Ok(()) From 59060d011b761543e6f002e1b62b0812803d74e1 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Mon, 21 Mar 2022 16:45:53 +0000 Subject: [PATCH 12/16] Internal move of files --- src/indexes/mod.rs | 4 ---- src/{ => read}/indexes/deserialize.rs | 2 +- src/read/indexes/mod.rs | 4 ++++ src/{ => read}/indexes/read.rs | 2 +- src/read/mod.rs | 3 ++- src/write/file.rs | 2 +- src/write/indexes/mod.rs | 4 ++++ src/{ => write}/indexes/serialize.rs | 0 src/{ => write}/indexes/write.rs | 0 src/write/mod.rs | 1 + 10 files changed, 14 insertions(+), 8 deletions(-) rename src/{ => read}/indexes/deserialize.rs (94%) create mode 100644 src/read/indexes/mod.rs rename src/{ => read}/indexes/read.rs (99%) create mode 100644 src/write/indexes/mod.rs rename src/{ => write}/indexes/serialize.rs (100%) rename src/{ => write}/indexes/write.rs (100%) diff --git a/src/indexes/mod.rs b/src/indexes/mod.rs index c4f46e0bf..cbd61ad71 100644 --- a/src/indexes/mod.rs +++ b/src/indexes/mod.rs @@ -1,9 +1,5 @@ -mod deserialize; mod index; mod intervals; -pub(crate) mod read; -mod serialize; -pub(crate) mod write; pub use crate::parquet_bridge::BoundaryOrder; pub use parquet_format_async_temp::PageLocation; diff --git a/src/indexes/deserialize.rs b/src/read/indexes/deserialize.rs similarity index 94% rename from src/indexes/deserialize.rs rename to src/read/indexes/deserialize.rs index e4bce70d4..14ebb9d2f 100644 --- a/src/indexes/deserialize.rs +++ b/src/read/indexes/deserialize.rs @@ -5,7 +5,7 @@ use parquet_format_async_temp::{thrift::protocol::TCompactInputProtocol, ColumnI use crate::error::ParquetError; use crate::schema::types::{PhysicalType, PrimitiveType}; -use super::{ByteIndex, FixedLenByteIndex, Index, NativeIndex}; +use crate::indexes::{ByteIndex, FixedLenByteIndex, Index, NativeIndex}; pub fn deserialize( data: &[u8], diff --git a/src/read/indexes/mod.rs b/src/read/indexes/mod.rs new file mode 100644 index 000000000..1e1919c84 --- /dev/null +++ b/src/read/indexes/mod.rs @@ -0,0 +1,4 @@ +mod deserialize; +mod read; + +pub use read::*; diff --git a/src/indexes/read.rs b/src/read/indexes/read.rs similarity index 99% rename from src/indexes/read.rs rename to src/read/indexes/read.rs index b4842bd51..fdbb34224 100644 --- a/src/indexes/read.rs +++ b/src/read/indexes/read.rs @@ -7,10 +7,10 @@ use parquet_format_async_temp::{ }; use crate::error::ParquetError; +use crate::indexes::Index; use crate::metadata::ColumnChunkMetaData; use super::deserialize::deserialize; -use super::Index; fn prepare_read Option, G: Fn(&ColumnChunk) -> 
Option>( chunks: &[ColumnChunkMetaData], diff --git a/src/read/mod.rs b/src/read/mod.rs index d5216c091..d1235a34f 100644 --- a/src/read/mod.rs +++ b/src/read/mod.rs @@ -1,4 +1,5 @@ mod compression; +mod indexes; pub mod levels; mod metadata; mod page; @@ -24,7 +25,7 @@ use crate::page::CompressedDataPage; use crate::schema::types::ParquetType; use crate::{error::Result, metadata::FileMetaData}; -pub use crate::indexes::read::{read_columns_indexes, read_pages_locations}; +pub use indexes::{read_columns_indexes, read_pages_locations}; /// Filters row group metadata to only those row groups, /// for which the predicate function returns true diff --git a/src/write/file.rs b/src/write/file.rs index 2fabd0099..246eb2121 100644 --- a/src/write/file.rs +++ b/src/write/file.rs @@ -6,13 +6,13 @@ use parquet_format_async_temp::thrift::protocol::TCompactOutputProtocol; use parquet_format_async_temp::thrift::protocol::TOutputProtocol; use parquet_format_async_temp::RowGroup; -use crate::indexes::write::{write_column_index, write_offset_index}; use crate::{ error::{ParquetError, Result}, metadata::SchemaDescriptor, FOOTER_SIZE, PARQUET_MAGIC, }; +use super::indexes::{write_column_index, write_offset_index}; use super::page::PageWriteSpec; use super::{row_group::write_row_group, RowGroupIter, WriteOptions}; diff --git a/src/write/indexes/mod.rs b/src/write/indexes/mod.rs new file mode 100644 index 000000000..9f413a15d --- /dev/null +++ b/src/write/indexes/mod.rs @@ -0,0 +1,4 @@ +mod serialize; +mod write; + +pub use write::*; diff --git a/src/indexes/serialize.rs b/src/write/indexes/serialize.rs similarity index 100% rename from src/indexes/serialize.rs rename to src/write/indexes/serialize.rs diff --git a/src/indexes/write.rs b/src/write/indexes/write.rs similarity index 100% rename from src/indexes/write.rs rename to src/write/indexes/write.rs diff --git a/src/write/mod.rs b/src/write/mod.rs index 2b9b27a72..dd41e4f44 100644 --- a/src/write/mod.rs +++ b/src/write/mod.rs @@ -1,6 +1,7 @@ mod column_chunk; mod compression; mod file; +mod indexes; pub(crate) mod page; mod row_group; pub(self) mod statistics; From 2d5e48230b4962bf5037bc40cac8cbe997884815 Mon Sep 17 00:00:00 2001 From: "Jorge C. 
Leitao" Date: Tue, 22 Mar 2022 06:46:44 +0000 Subject: [PATCH 13/16] Added boolean index --- src/indexes/index.rs | 56 +++++++++++++++++++++++++++++++++ src/indexes/mod.rs | 2 +- src/read/indexes/deserialize.rs | 12 +++---- src/read/indexes/read.rs | 12 ++----- tests/it/write/mod.rs | 4 +-- 5 files changed, 67 insertions(+), 19 deletions(-) diff --git a/src/indexes/index.rs b/src/indexes/index.rs index 7f117ad8b..0dca00a40 100644 --- a/src/indexes/index.rs +++ b/src/indexes/index.rs @@ -260,3 +260,59 @@ impl Index for FixedLenByteIndex { &self.primitive_type.physical_type } } + +/// An index of a column of boolean physical type +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct BooleanIndex { + /// The indexes, one item per page + pub indexes: Vec>, + pub boundary_order: BoundaryOrder, +} + +impl BooleanIndex { + pub(crate) fn try_new(index: ColumnIndex) -> Result { + let len = index.min_values.len(); + + let null_counts = index + .null_counts + .map(|x| x.into_iter().map(Some).collect::>()) + .unwrap_or_else(|| vec![None; len]); + + let indexes = index + .min_values + .into_iter() + .zip(index.max_values.into_iter()) + .zip(index.null_pages.into_iter()) + .zip(null_counts.into_iter()) + .map(|(((min, max), is_null), null_count)| { + let (min, max) = if is_null { + (None, None) + } else { + let min = min[0] == 1; + let max = max[0] == 1; + (Some(min), Some(max)) + }; + Ok(PageIndex { + min, + max, + null_count, + }) + }) + .collect::, ParquetError>>()?; + + Ok(Self { + indexes, + boundary_order: index.boundary_order.try_into()?, + }) + } +} + +impl Index for BooleanIndex { + fn as_any(&self) -> &dyn Any { + self + } + + fn physical_type(&self) -> &PhysicalType { + &PhysicalType::Boolean + } +} diff --git a/src/indexes/mod.rs b/src/indexes/mod.rs index cbd61ad71..6af524587 100644 --- a/src/indexes/mod.rs +++ b/src/indexes/mod.rs @@ -4,7 +4,7 @@ mod intervals; pub use crate::parquet_bridge::BoundaryOrder; pub use parquet_format_async_temp::PageLocation; -pub use self::index::{ByteIndex, FixedLenByteIndex, Index, NativeIndex, PageIndex}; +pub use self::index::{BooleanIndex, ByteIndex, FixedLenByteIndex, Index, NativeIndex, PageIndex}; pub use intervals::{compute_rows, select_pages, FilteredPage, Interval}; #[cfg(test)] diff --git a/src/read/indexes/deserialize.rs b/src/read/indexes/deserialize.rs index 14ebb9d2f..453b42bd6 100644 --- a/src/read/indexes/deserialize.rs +++ b/src/read/indexes/deserialize.rs @@ -5,22 +5,20 @@ use parquet_format_async_temp::{thrift::protocol::TCompactInputProtocol, ColumnI use crate::error::ParquetError; use crate::schema::types::{PhysicalType, PrimitiveType}; -use crate::indexes::{ByteIndex, FixedLenByteIndex, Index, NativeIndex}; +use crate::indexes::{BooleanIndex, ByteIndex, FixedLenByteIndex, Index, NativeIndex}; pub fn deserialize( data: &[u8], primitive_type: PrimitiveType, -) -> Result>, ParquetError> { +) -> Result, ParquetError> { let mut d = Cursor::new(data); let mut prot = TCompactInputProtocol::new(&mut d); let index = ColumnIndex::read_from_in_protocol(&mut prot)?; let index = match primitive_type.physical_type { - PhysicalType::Boolean => return Ok(None), - PhysicalType::Int32 => { - Box::new(NativeIndex::::try_new(index, primitive_type)?) as Box - } + PhysicalType::Boolean => Box::new(BooleanIndex::try_new(index)?) 
+        PhysicalType::Int32 => Box::new(NativeIndex::<i32>::try_new(index, primitive_type)?),
         PhysicalType::Int64 => Box::new(NativeIndex::<i64>::try_new(index, primitive_type)?),
         PhysicalType::Int96 => Box::new(NativeIndex::<[u32; 3]>::try_new(index, primitive_type)?),
         PhysicalType::Float => Box::new(NativeIndex::<f32>::try_new(index, primitive_type)?),
@@ -31,5 +29,5 @@ pub fn deserialize(
         }
     };
 
-    Ok(Some(index))
+    Ok(index)
 }
diff --git a/src/read/indexes/read.rs b/src/read/indexes/read.rs
index fdbb34224..31382d8cd 100644
--- a/src/read/indexes/read.rs
+++ b/src/read/indexes/read.rs
@@ -54,18 +54,12 @@ fn prepare_read Option, G: Fn(&ColumnChunk) -> Optio
 fn prepare_column_index_read(
     chunks: &[ColumnChunkMetaData],
 ) -> Result<(u64, Vec<usize>), ParquetError> {
-    // c1: [start, length]
-    // ...
-    // cN: [start, length]
     prepare_read(chunks, |x| x.column_index_offset, |x| x.column_index_length)
 }
 
 fn prepare_offset_index_read(
     chunks: &[ColumnChunkMetaData],
 ) -> Result<(u64, Vec<usize>), ParquetError> {
-    // c1: [start, length]
-    // ...
-    // cN: [start, length]
     prepare_read(chunks, |x| x.offset_index_offset, |x| x.offset_index_length)
 }
 
@@ -73,10 +67,10 @@ fn deserialize_column_indexes(
     chunks: &[ColumnChunkMetaData],
     data: &[u8],
     lengths: Vec<usize>,
-) -> Result<Vec<Option<Box<dyn Index>>>, ParquetError> {
+) -> Result<Vec<Box<dyn Index>>, ParquetError> {
     let mut start = 0;
     let data = lengths.into_iter().map(|length| {
-        let r = &data[start..length];
+        let r = &data[start..start + length];
         start += length;
         r
     });
@@ -96,7 +90,7 @@ fn deserialize_column_indexes(
 pub fn read_columns_indexes<R: Read + Seek>(
     reader: &mut R,
     chunks: &[ColumnChunkMetaData],
-) -> Result<Vec<Option<Box<dyn Index>>>, ParquetError> {
+) -> Result<Vec<Box<dyn Index>>, ParquetError> {
     let (offset, lengths) = prepare_column_index_read(chunks)?;
 
     let length = lengths.iter().sum::<usize>();
diff --git a/tests/it/write/mod.rs b/tests/it/write/mod.rs
index e3223ac0b..d84c27eeb 100644
--- a/tests/it/write/mod.rs
+++ b/tests/it/write/mod.rs
@@ -236,7 +236,7 @@ fn indexes() -> Result<()> {
             first_row_index: array1.len() as i64,
         },
     ]];
-    let expected_index = vec![Some(Box::new(NativeIndex::<i32> {
+    let expected_index = vec![Box::new(NativeIndex::<i32> {
         primitive_type: PrimitiveType::from_physical("col".to_string(), PhysicalType::Int32),
         indexes: vec![
             PageIndex {
@@ -251,7 +251,7 @@ fn indexes() -> Result<()> {
             },
         ],
         boundary_order: BoundaryOrder::Unordered,
-    }) as Box<dyn Index>)];
+    }) as Box<dyn Index>];
 
     let indexes = read_columns_indexes(&mut reader, columns)?;
     assert_eq!(&indexes, &expected_index);
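Illustration (not part of the patch): the boolean index stores each page's min/max statistic as a single byte, which `BooleanIndex::try_new` decodes with `min[0] == 1`. A minimal standalone sketch of that decoding rule; `decode_bool_stat` is a hypothetical helper, and here an empty buffer stands in for a null page, whereas the patch itself relies on the `null_pages` flags:

    // Hypothetical helper mirroring the byte-to-bool rule used in try_new.
    fn decode_bool_stat(buf: &[u8]) -> Option<bool> {
        // one byte per statistic: 1 = true, anything else = false
        buf.first().map(|&byte| byte == 1)
    }

    fn main() {
        assert_eq!(decode_bool_stat(&[1]), Some(true));
        assert_eq!(decode_bool_stat(&[0]), Some(false));
        assert_eq!(decode_bool_stat(&[]), None);
    }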
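Since `deserialize` now returns a `Box<dyn Index>` for every physical type instead of `None` for booleans, callers recover the concrete index through `as_any`. A sketch of consumer-side dispatch, assuming the crate's public re-exports; the `describe` function is illustrative only:

    use parquet2::indexes::{BooleanIndex, Index, NativeIndex};
    use parquet2::schema::types::PhysicalType;

    fn describe(index: &dyn Index) {
        match index.physical_type() {
            PhysicalType::Boolean => {
                // the downcast cannot fail when the physical type matches
                let index = index.as_any().downcast_ref::<BooleanIndex>().unwrap();
                println!("boolean column, {} pages", index.indexes.len());
            }
            PhysicalType::Int32 => {
                let index = index.as_any().downcast_ref::<NativeIndex<i32>>().unwrap();
                println!("i32 column, {} pages", index.indexes.len());
            }
            _ => println!("other physical type"),
        }
    }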
From 80c85b528d2981b5c7220d1bdef92cafc3771d32 Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao"
Date: Tue, 22 Mar 2022 21:59:52 +0000
Subject: [PATCH 14/16] Improved API

---
 src/indexes/intervals.rs        | 32 +++++++++++++++-----------------
 src/indexes/mod.rs              | 16 +++------------
 src/read/mod.rs                 | 20 --------------------
 src/read/page/indexed_reader.rs | 12 ++++++------
 4 files changed, 24 insertions(+), 56 deletions(-)

diff --git a/src/indexes/intervals.rs b/src/indexes/intervals.rs
index 1506dac99..ce235db03 100644
--- a/src/indexes/intervals.rs
+++ b/src/indexes/intervals.rs
@@ -2,8 +2,6 @@ use parquet_format_async_temp::PageLocation;
 
 use crate::error::ParquetError;
 
-use super::index::PageIndex;
-
 /// An interval
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub struct Interval {
@@ -21,7 +19,7 @@ impl Interval {
 }
 
 /// Returns the set of (row) intervals of the pages.
-fn compute_row_page_intervals(
+fn compute_page_row_intervals(
     locations: &[PageLocation],
     num_rows: u64,
 ) -> Result<Vec<Interval>, ParquetError> {
@@ -49,25 +47,25 @@ fn compute_row_page_intervals(
 
 /// Returns the set of intervals `(start, len)` containing all the
 /// selected rows (for a given column)
-pub fn compute_rows<'a, T>(
-    index: &'a [PageIndex<T>],
+pub fn compute_rows(
+    selected: &[bool],
     locations: &[PageLocation],
     num_rows: u64,
-    selector: &dyn Fn(&'a PageIndex<T>) -> bool,
 ) -> Result<Vec<Interval>, ParquetError> {
-    let page_intervals = compute_row_page_intervals(locations, num_rows)?;
+    let page_intervals = compute_page_row_intervals(locations, num_rows)?;
 
-    Ok(index
+    Ok(selected
         .iter()
         .zip(page_intervals.iter().copied())
-        .filter_map(|(index, page)| {
-            let is_selected = selector(index);
-            if is_selected {
-                Some(page)
-            } else {
-                None
-            }
-        })
+        .filter_map(
+            |(&is_selected, page)| {
+                if is_selected {
+                    Some(page)
+                } else {
+                    None
+                }
+            },
+        )
         .collect())
 }
 
@@ -121,7 +119,7 @@ pub fn select_pages(
     locations: &[PageLocation],
     num_rows: u64,
 ) -> Result<Vec<FilteredPage>, ParquetError> {
-    let page_intervals = compute_row_page_intervals(locations, num_rows)?;
+    let page_intervals = compute_page_row_intervals(locations, num_rows)?;
 
     page_intervals
         .into_iter()
diff --git a/src/indexes/mod.rs b/src/indexes/mod.rs
index 6af524587..e0b80dcdf 100644
--- a/src/indexes/mod.rs
+++ b/src/indexes/mod.rs
@@ -15,15 +15,6 @@ mod tests {
 
     #[test]
     fn test_basic() {
-        let index = NativeIndex {
-            primitive_type: PrimitiveType::from_physical("c1".to_string(), PhysicalType::Int32),
-            indexes: vec![PageIndex {
-                min: Some(0i32),
-                max: Some(10),
-                null_count: Some(0),
-            }],
-            boundary_order: Default::default(),
-        };
         let locations = &[PageLocation {
             offset: 100,
             compressed_page_size: 10,
             first_row_index: 0,
         }];
         let num_rows = 10;
 
-        let selector = |_| true;
-
-        let row_intervals = compute_rows(&index.indexes, locations, num_rows, &selector).unwrap();
+        let row_intervals = compute_rows(&[true; 1], locations, num_rows).unwrap();
         assert_eq!(row_intervals, vec![Interval::new(0, 10)])
     }
 
@@ -77,8 +66,9 @@ mod tests {
             .map(|x| x.as_slice() > &[97])
             .unwrap_or(false) // no max is present => all nulls => not selected
         };
+        let selected = index.indexes.iter().map(selector).collect::<Vec<_>>();
 
-        let rows = compute_rows(&index.indexes, locations, num_rows, &selector).unwrap();
+        let rows = compute_rows(&selected, locations, num_rows).unwrap();
         assert_eq!(rows, vec![Interval::new(5, 5)]);
 
         let pages = select_pages(&rows, locations, num_rows).unwrap();
diff --git a/src/read/mod.rs b/src/read/mod.rs
index d1235a34f..44ff10a19 100644
--- a/src/read/mod.rs
+++ b/src/read/mod.rs
@@ -19,7 +19,6 @@ pub use page::{IndexedPageReader, PageFilter, PageReader};
 pub use stream::read_metadata as read_metadata_async;
 
 use crate::error::ParquetError;
-use crate::indexes::FilteredPage;
 use crate::metadata::{ColumnChunkMetaData, RowGroupMetaData};
 use crate::page::CompressedDataPage;
 use crate::schema::types::ParquetType;
@@ -65,25 +64,6 @@ pub fn get_page_iterator(
     ))
 }
 
-/// Returns a new [`IndexedPageReader`] by seeking `reader` to the begining of `column_chunk`.
-pub fn get_indexed_page_reader<R: Read + Seek>(
-    column_chunk: &ColumnChunkMetaData,
-    reader: R,
-    pages: Vec<FilteredPage>,
-    buffer: Vec<u8>,
-    data_buffer: Vec<u8>,
-) -> Result<IndexedPageReader<R>> {
-    Ok(IndexedPageReader::new(
-        reader,
-        column_chunk.compression(),
-        column_chunk.descriptor().descriptor.clone(),
-        column_chunk.byte_range().0,
-        pages,
-        buffer,
-        data_buffer,
-    ))
-}
-
 /// Returns an [`Iterator`] of [`ColumnChunkMetaData`] corresponding to the columns
 /// from `field` at `row_group`.
 /// For primitive fields (e.g. `i64`), the iterator has exactly one item.
diff --git a/src/read/page/indexed_reader.rs b/src/read/page/indexed_reader.rs
index 11997d6e3..03151a84f 100644
--- a/src/read/page/indexed_reader.rs
+++ b/src/read/page/indexed_reader.rs
@@ -7,7 +7,7 @@ use std::{
 use crate::{
     error::ParquetError,
     indexes::FilteredPage,
-    metadata::Descriptor,
+    metadata::{ColumnChunkMetaData, Descriptor},
     page::{CompressedDataPage, DictPage, ParquetPageHeader},
     parquet_bridge::Compression,
 };
@@ -102,15 +102,15 @@ fn read_dict_page(
 }
 
 impl<R: Read + Seek> IndexedPageReader<R> {
+    /// Returns a new [`IndexedPageReader`].
     pub fn new(
         reader: R,
-        compression: Compression,
-        descriptor: Descriptor,
-        column_start: u64,
+        column: &ColumnChunkMetaData,
         pages: Vec<FilteredPage>,
         buffer: Vec<u8>,
         data_buffer: Vec<u8>,
     ) -> Self {
+        let column_start = column.byte_range().0;
         // a dictionary page exists iff the first data page is not at the start of
        // the column
         let dictionary = match pages.get(0) {
@@ -128,8 +128,8 @@ impl<R: Read + Seek> IndexedPageReader<R> {
         let pages = pages.into_iter().collect();
         Self {
             reader,
-            compression,
-            descriptor,
+            compression: column.compression(),
+            descriptor: column.descriptor().descriptor.clone(),
             buffer,
             data_buffer,
             pages,
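Illustration (not part of the patch): with the new signature, the caller first evaluates its predicate into a plain `&[bool]` mask, one entry per page, and then intersects it with the page locations. A sketch under the assumption that the first of two pages was selected; the values are made up:

    use parquet2::error::ParquetError;
    use parquet2::indexes::{compute_rows, select_pages, FilteredPage, Interval, PageLocation};

    fn filter_pages() -> Result<(), ParquetError> {
        // one entry per page: `true` means the page may contain matching rows
        let selected = [true, false];
        let locations = [
            PageLocation { offset: 4, compressed_page_size: 100, first_row_index: 0 },
            PageLocation { offset: 104, compressed_page_size: 100, first_row_index: 5 },
        ];
        let num_rows = 10;

        // row intervals covered by the selected pages (here: rows [0, 5))
        let rows: Vec<Interval> = compute_rows(&selected, &locations, num_rows)?;
        // the pages (with the row ranges to keep) that must actually be read
        let _pages: Vec<FilteredPage> = select_pages(&rows, &locations, num_rows)?;
        Ok(())
    }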
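The same patch folds `get_indexed_page_reader` into `IndexedPageReader::new`, which now derives compression, descriptor and column start from the metadata. A sketch of the new call shape; the `indexed_reader` wrapper is hypothetical:

    use std::io::{Read, Seek};

    use parquet2::indexes::FilteredPage;
    use parquet2::metadata::ColumnChunkMetaData;
    use parquet2::read::IndexedPageReader;

    // hypothetical convenience wrapper around the new constructor
    fn indexed_reader<R: Read + Seek>(
        reader: R,
        column: &ColumnChunkMetaData,
        pages: Vec<FilteredPage>,
    ) -> IndexedPageReader<R> {
        // the two empty vectors are scratch buffers for page data
        IndexedPageReader::new(reader, column, pages, vec![], vec![])
    }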
From f67633b1e43733c6024f96ee57bf111177fbcfc4 Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao"
Date: Tue, 22 Mar 2022 22:47:41 +0000
Subject: [PATCH 15/16] Improved API

---
 src/read/mod.rs                 |  9 +--------
 src/read/page/indexed_reader.rs |  3 ---
 src/read/page/reader.rs         | 15 ++++++++-------
 3 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/src/read/mod.rs b/src/read/mod.rs
index 44ff10a19..db1597389 100644
--- a/src/read/mod.rs
+++ b/src/read/mod.rs
@@ -54,14 +54,7 @@ pub fn get_page_iterator(
     let (col_start, _) = column_chunk.byte_range();
     reader.seek(SeekFrom::Start(col_start))?;
 
-    Ok(PageReader::new(
-        reader,
-        column_chunk.num_values(),
-        column_chunk.compression(),
-        column_chunk.descriptor().descriptor.clone(),
-        pages_filter,
-        buffer,
-    ))
+    Ok(PageReader::new(reader, column_chunk, pages_filter, buffer))
 }
 
 /// Returns an [`Iterator`] of [`ColumnChunkMetaData`] corresponding to the columns
diff --git a/src/read/page/indexed_reader.rs b/src/read/page/indexed_reader.rs
index 03151a84f..58509b8d2 100644
--- a/src/read/page/indexed_reader.rs
+++ b/src/read/page/indexed_reader.rs
@@ -196,9 +196,6 @@ impl<R: Read + Seek> Iterator for IndexedPageReader<R> {
     type Item = Result<CompressedDataPage>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        // todo: check if the first page is a dictionary page and read it accordingly so that
-        // we can attach it to data pages
-
         if let Some(page) = self.pages.pop_front() {
             match page {
                 FilteredPage::Select {
diff --git a/src/read/page/reader.rs b/src/read/page/reader.rs
index 56b5b8139..4373f5eef 100644
--- a/src/read/page/reader.rs
+++ b/src/read/page/reader.rs
@@ -5,7 +5,7 @@ use parquet_format_async_temp::thrift::protocol::TCompactInputProtocol;
 
 use crate::compression::Compression;
 use crate::error::Result;
-use crate::metadata::Descriptor;
+use crate::metadata::{ColumnChunkMetaData, Descriptor};
 use crate::page::{
     read_dict_page, CompressedDataPage, DataPageHeader, DictPage, EncodedDictPage, PageType,
@@ -46,21 +46,22 @@ pub struct PageReader {
 }
 
 impl PageReader {
+    /// Returns a new [`PageReader`].
+    ///
+    /// It assumes that the reader has been `seeked` to the beginning of `column`.
     pub fn new(
         reader: R,
-        total_num_values: i64,
-        compression: Compression,
-        descriptor: Descriptor,
+        column: &ColumnChunkMetaData,
         pages_filter: PageFilter,
         buffer: Vec<u8>,
     ) -> Self {
         Self {
             reader,
-            total_num_values,
-            compression,
+            total_num_values: column.num_values(),
+            compression: column.compression(),
             seen_num_values: 0,
             current_dictionary: None,
-            descriptor,
+            descriptor: column.descriptor().descriptor.clone(),
             pages_filter,
             buffer,
         }
     }
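Illustration (not part of the patch): after this change a `PageReader` is built from the column metadata alone; only the seek to the start of the column chunk remains the caller's responsibility, as `get_page_iterator` shows. A sketch; the `pages` helper is hypothetical:

    use std::io::{Read, Seek, SeekFrom};

    use parquet2::error::Result;
    use parquet2::metadata::ColumnChunkMetaData;
    use parquet2::read::{PageFilter, PageReader};

    // hypothetical helper mirroring get_page_iterator
    fn pages<R: Read + Seek>(
        mut reader: R,
        column: &ColumnChunkMetaData,
        filter: PageFilter,
        buffer: Vec<u8>,
    ) -> Result<PageReader<R>> {
        let (start, _) = column.byte_range();
        // PageReader assumes the reader is already positioned at the column start
        reader.seek(SeekFrom::Start(start))?;
        Ok(PageReader::new(reader, column, filter, buffer))
    }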
From f67e75c6ebc54d7d63f636ac010d475bd0a964a9 Mon Sep 17 00:00:00 2001
From: "Jorge C. Leitao"
Date: Fri, 25 Mar 2022 07:29:04 +0000
Subject: [PATCH 16/16] Added test reading indexes from spark

---
 src/indexes/index.rs     |   5 +-
 tests/it/read/indexes.rs | 141 +++++++++++++++++++++++++++++++++
 tests/it/read/mod.rs     |   1 +
 3 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 tests/it/read/indexes.rs

diff --git a/src/indexes/index.rs b/src/indexes/index.rs
index 0dca00a40..dc8bd7435 100644
--- a/src/indexes/index.rs
+++ b/src/indexes/index.rs
@@ -29,7 +29,10 @@ fn equal(lhs: &dyn Index, rhs: &dyn Index) -> bool {
     }
 
     match lhs.physical_type() {
-        PhysicalType::Boolean => unreachable!(),
+        PhysicalType::Boolean => {
+            lhs.as_any().downcast_ref::<BooleanIndex>().unwrap()
+                == rhs.as_any().downcast_ref::<BooleanIndex>().unwrap()
+        }
         PhysicalType::Int32 => {
             lhs.as_any().downcast_ref::<NativeIndex<i32>>().unwrap()
                 == rhs.as_any().downcast_ref::<NativeIndex<i32>>().unwrap()
         }
diff --git a/tests/it/read/indexes.rs b/tests/it/read/indexes.rs
new file mode 100644
index 000000000..8776cbc8c
--- /dev/null
+++ b/tests/it/read/indexes.rs
@@ -0,0 +1,141 @@
+use parquet2::{
+    error::ParquetError,
+    indexes::{
+        BooleanIndex, BoundaryOrder, ByteIndex, Index, NativeIndex, PageIndex, PageLocation,
+    },
+    read::{read_columns_indexes, read_metadata, read_pages_locations},
+    schema::{
+        types::{FieldInfo, LogicalType, PhysicalType, PrimitiveConvertedType, PrimitiveType},
+        Repetition,
+    },
+};
+
+/*
+import pyspark.sql  # 3.2.1
+spark = pyspark.sql.SparkSession.builder.getOrCreate()
+spark.conf.set("parquet.bloom.filter.enabled", True)
+spark.conf.set("parquet.bloom.filter.expected.ndv", 10)
+spark.conf.set("parquet.bloom.filter.max.bytes", 32)
+
+data = [(i, f"{i}", False) for i in range(10)]
+df = spark.createDataFrame(data, ["id", "string", "bool"]).repartition(1)
+
+df.write.parquet("bla.parquet", mode = "overwrite")
+*/
+const FILE: &[u8] = &[
+    80, 65, 82, 49, 21, 0, 21, 172, 1, 21, 138, 1, 21, 169, 161, 209, 137, 5, 28, 21, 20, 21, 0,
+    21, 6, 21, 8, 0, 0, 86, 24, 2, 0, 0, 0, 20, 1, 0, 13, 1, 17, 9, 1, 22, 1, 1, 0, 3, 1, 5, 12, 0,
+    0, 0, 4, 1, 5, 12, 0, 0, 0, 5, 1, 5, 12, 0, 0, 0, 6, 1, 5, 12, 0, 0, 0, 7, 1, 5, 72, 0, 0, 0,
+    8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 21, 0, 21, 112, 21, 104, 21, 138, 239, 232,
+    170, 15, 28, 21, 20, 21, 0, 21, 6, 21, 8, 0, 0, 56, 40, 2, 0, 0, 0, 20, 1, 1, 0, 0, 0, 48, 1,
+    5, 0, 49, 1, 5, 0, 50, 1, 5, 0, 51, 1, 5, 0, 52, 1, 5, 0, 53, 1, 5, 60, 54, 1, 0, 0, 0, 55, 1,
+    0, 0, 0, 56, 1, 0, 0, 0, 57, 21, 0, 21, 16, 21, 20, 21, 202, 209, 169, 227, 4, 28, 21, 20, 21,
+    0, 21, 6, 21, 8, 0, 0, 8, 28, 2, 0, 0, 0, 20, 1, 0, 0, 25, 17, 2, 25, 24, 8, 0, 0, 0, 0, 0, 0,
+    0, 0, 25, 24, 8, 9, 0, 0, 0, 0, 0, 0, 0, 21, 2, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 48, 25, 24,
+    1, 57, 21, 2, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 0, 25, 24, 1,
0, 21, 2, 25, 22, 0, 0, 25, 28, + 22, 8, 21, 188, 1, 22, 0, 0, 0, 25, 28, 22, 196, 1, 21, 150, 1, 22, 0, 0, 0, 25, 28, 22, 218, + 2, 21, 66, 22, 0, 0, 0, 21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 24, 130, 24, 8, + 134, 8, 68, 6, 2, 101, 128, 10, 64, 2, 38, 78, 114, 1, 64, 38, 1, 192, 194, 152, 64, 70, 0, 36, + 56, 121, 64, 0, 21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 8, 17, 10, 29, 5, 88, 194, + 0, 35, 208, 25, 16, 70, 68, 48, 38, 17, 16, 140, 68, 98, 56, 0, 131, 4, 193, 40, 129, 161, 160, + 1, 96, 21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 2, 25, 76, 72, 12, 115, 112, + 97, 114, 107, 95, 115, 99, 104, 101, 109, 97, 21, 6, 0, 21, 4, 37, 2, 24, 2, 105, 100, 0, 21, + 12, 37, 2, 24, 6, 115, 116, 114, 105, 110, 103, 37, 0, 76, 28, 0, 0, 0, 21, 0, 37, 2, 24, 4, + 98, 111, 111, 108, 0, 22, 20, 25, 28, 25, 60, 38, 8, 28, 21, 4, 25, 53, 0, 6, 8, 25, 24, 2, + 105, 100, 21, 2, 22, 20, 22, 222, 1, 22, 188, 1, 38, 8, 60, 24, 8, 9, 0, 0, 0, 0, 0, 0, 0, 24, + 8, 0, 0, 0, 0, 0, 0, 0, 0, 22, 0, 40, 8, 9, 0, 0, 0, 0, 0, 0, 0, 24, 8, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 25, 28, 21, 0, 21, 0, 21, 2, 0, 22, 226, 4, 0, 22, 158, 4, 21, 22, 22, 156, 3, 21, 62, 0, + 38, 196, 1, 28, 21, 12, 25, 53, 0, 6, 8, 25, 24, 6, 115, 116, 114, 105, 110, 103, 21, 2, 22, + 20, 22, 158, 1, 22, 150, 1, 38, 196, 1, 60, 54, 0, 40, 1, 57, 24, 1, 48, 0, 25, 28, 21, 0, 21, + 0, 21, 2, 0, 22, 192, 5, 0, 22, 180, 4, 21, 24, 22, 218, 3, 21, 34, 0, 38, 218, 2, 28, 21, 0, + 25, 53, 0, 6, 8, 25, 24, 4, 98, 111, 111, 108, 21, 2, 22, 20, 22, 62, 22, 66, 38, 218, 2, 60, + 24, 1, 0, 24, 1, 0, 22, 0, 40, 1, 0, 24, 1, 0, 0, 25, 28, 21, 0, 21, 0, 21, 2, 0, 22, 158, 6, + 0, 22, 204, 4, 21, 22, 22, 252, 3, 21, 34, 0, 22, 186, 3, 22, 20, 38, 8, 22, 148, 3, 20, 0, 0, + 25, 44, 24, 24, 111, 114, 103, 46, 97, 112, 97, 99, 104, 101, 46, 115, 112, 97, 114, 107, 46, + 118, 101, 114, 115, 105, 111, 110, 24, 5, 51, 46, 50, 46, 49, 0, 24, 41, 111, 114, 103, 46, 97, + 112, 97, 99, 104, 101, 46, 115, 112, 97, 114, 107, 46, 115, 113, 108, 46, 112, 97, 114, 113, + 117, 101, 116, 46, 114, 111, 119, 46, 109, 101, 116, 97, 100, 97, 116, 97, 24, 213, 1, 123, 34, + 116, 121, 112, 101, 34, 58, 34, 115, 116, 114, 117, 99, 116, 34, 44, 34, 102, 105, 101, 108, + 100, 115, 34, 58, 91, 123, 34, 110, 97, 109, 101, 34, 58, 34, 105, 100, 34, 44, 34, 116, 121, + 112, 101, 34, 58, 34, 108, 111, 110, 103, 34, 44, 34, 110, 117, 108, 108, 97, 98, 108, 101, 34, + 58, 116, 114, 117, 101, 44, 34, 109, 101, 116, 97, 100, 97, 116, 97, 34, 58, 123, 125, 125, 44, + 123, 34, 110, 97, 109, 101, 34, 58, 34, 115, 116, 114, 105, 110, 103, 34, 44, 34, 116, 121, + 112, 101, 34, 58, 34, 115, 116, 114, 105, 110, 103, 34, 44, 34, 110, 117, 108, 108, 97, 98, + 108, 101, 34, 58, 116, 114, 117, 101, 44, 34, 109, 101, 116, 97, 100, 97, 116, 97, 34, 58, 123, + 125, 125, 44, 123, 34, 110, 97, 109, 101, 34, 58, 34, 98, 111, 111, 108, 34, 44, 34, 116, 121, + 112, 101, 34, 58, 34, 98, 111, 111, 108, 101, 97, 110, 34, 44, 34, 110, 117, 108, 108, 97, 98, + 108, 101, 34, 58, 116, 114, 117, 101, 44, 34, 109, 101, 116, 97, 100, 97, 116, 97, 34, 58, 123, + 125, 125, 93, 125, 0, 24, 74, 112, 97, 114, 113, 117, 101, 116, 45, 109, 114, 32, 118, 101, + 114, 115, 105, 111, 110, 32, 49, 46, 49, 50, 46, 50, 32, 40, 98, 117, 105, 108, 100, 32, 55, + 55, 101, 51, 48, 99, 56, 48, 57, 51, 51, 56, 54, 101, 99, 53, 50, 99, 51, 99, 102, 97, 54, 99, + 51, 52, 98, 55, 101, 102, 51, 51, 50, 49, 51, 50, 
50, 99, 57, 52, 41, 25, 60, 28, 0, 0, 28, 0,
+    0, 28, 0, 0, 0, 182, 2, 0, 0, 80, 65, 82, 49,
+];
+
+#[test]
+fn test() -> Result<(), ParquetError> {
+    let mut reader = std::io::Cursor::new(FILE);
+
+    let expected_index = vec![
+        Box::new(NativeIndex::<i64> {
+            primitive_type: PrimitiveType::from_physical("id".to_string(), PhysicalType::Int64),
+            indexes: vec![PageIndex {
+                min: Some(0),
+                max: Some(9),
+                null_count: Some(0),
+            }],
+            boundary_order: BoundaryOrder::Ascending,
+        }) as Box<dyn Index>,
+        Box::new(ByteIndex {
+            primitive_type: PrimitiveType {
+                field_info: FieldInfo::new("string".to_string(), Repetition::Optional, None, false),
+                logical_type: Some(LogicalType::STRING(Default::default())),
+                converted_type: Some(PrimitiveConvertedType::Utf8),
+                physical_type: PhysicalType::ByteArray,
+            },
+            indexes: vec![PageIndex {
+                min: Some(b"0".to_vec()),
+                max: Some(b"9".to_vec()),
+                null_count: Some(0),
+            }],
+            boundary_order: BoundaryOrder::Ascending,
+        }),
+        Box::new(BooleanIndex {
+            indexes: vec![PageIndex {
+                min: Some(false),
+                max: Some(false),
+                null_count: Some(0),
+            }],
+            boundary_order: BoundaryOrder::Ascending,
+        }),
+    ];
+    let expected_page_locations = vec![
+        vec![PageLocation {
+            offset: 4,
+            compressed_page_size: 94,
+            first_row_index: 0,
+        }],
+        vec![PageLocation {
+            offset: 98,
+            compressed_page_size: 75,
+            first_row_index: 0,
+        }],
+        vec![PageLocation {
+            offset: 173,
+            compressed_page_size: 33,
+            first_row_index: 0,
+        }],
+    ];
+
+    let metadata = read_metadata(&mut reader)?;
+    let columns = &metadata.row_groups[0].columns();
+
+    let indexes = read_columns_indexes(&mut reader, columns)?;
+    assert_eq!(&indexes, &expected_index);
+
+    let pages = read_pages_locations(&mut reader, columns)?;
+    assert_eq!(pages, expected_page_locations);
+
+    Ok(())
+}
diff --git a/tests/it/read/mod.rs b/tests/it/read/mod.rs
index ed5cadd22..e3439076f 100644
--- a/tests/it/read/mod.rs
+++ b/tests/it/read/mod.rs
@@ -3,6 +3,7 @@
 /// but OTOH it has no external dependencies and is very familiar to Rust developers.
 mod binary;
 mod boolean;
+mod indexes;
 mod primitive;
 mod primitive_nested;
 mod struct_;
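Illustration (not part of the patch): end to end, the new test boils down to the following flow. A sketch, assuming a file written with column and offset indexes, such as the spark-generated `bla.parquet` from the comment above:

    use parquet2::error::ParquetError;
    use parquet2::read::{read_columns_indexes, read_metadata, read_pages_locations};

    fn inspect() -> Result<(), ParquetError> {
        let mut reader = std::fs::File::open("bla.parquet")?;

        let metadata = read_metadata(&mut reader)?;
        let columns = metadata.row_groups[0].columns();

        // one Box<dyn Index> per column; downcast via `as_any`, as `equal` does above
        let indexes = read_columns_indexes(&mut reader, columns)?;
        // one Vec<PageLocation> per column
        let locations = read_pages_locations(&mut reader, columns)?;

        assert_eq!(indexes.len(), locations.len());
        Ok(())
    }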