Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Added support to write nested parquet #1007

Merged
merged 7 commits into from
May 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions arrow-parquet-integration-testing/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,15 +175,15 @@ fn main() -> Result<()> {
.fields
.iter()
.map(|x| match x.data_type() {
DataType::Dictionary(..) => Encoding::RleDictionary,
DataType::Dictionary(..) => vec![Encoding::RleDictionary],
DataType::Utf8 | DataType::LargeUtf8 => {
if args.encoding_utf8 == EncodingScheme::Delta {
vec![if args.encoding_utf8 == EncodingScheme::Delta {
Encoding::DeltaLengthByteArray
} else {
Encoding::Plain
}
}]
}
_ => Encoding::Plain,
_ => vec![Encoding::Plain],
})
.collect();

Expand Down
2 changes: 1 addition & 1 deletion benches/write_parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ fn write(array: &dyn Array, encoding: Encoding) -> Result<()> {
vec![Ok(columns)].into_iter(),
&schema,
options,
vec![encoding],
vec![vec![encoding]],
)?;

let writer = vec![];
Expand Down
8 changes: 6 additions & 2 deletions examples/parquet_write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,12 @@ fn write_batch(path: &str, schema: Schema, columns: Chunk<Arc<dyn Array>>) -> Re

let iter = vec![Ok(columns)];

let row_groups =
RowGroupIterator::try_new(iter.into_iter(), &schema, options, vec![Encoding::Plain])?;
let row_groups = RowGroupIterator::try_new(
iter.into_iter(),
&schema,
options,
vec![vec![Encoding::Plain]],
)?;

// Create a new empty file
let file = File::create(path)?;
Expand Down
2 changes: 1 addition & 1 deletion src/doc/lib.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ fn main() -> Result<()> {
vec![Ok(chunk)].into_iter(),
&schema,
options,
vec![Encoding::Plain, Encoding::Plain],
vec![vec![Encoding::Plain], vec![Encoding::Plain]],
)?;

// anything implementing `std::io::Write` works
Expand Down
21 changes: 8 additions & 13 deletions src/io/parquet/write/binary/nested.rs
Original file line number Diff line number Diff line change
@@ -1,34 +1,29 @@
use parquet2::schema::types::PrimitiveType;
use parquet2::{encoding::Encoding, page::DataPage};

use super::super::{levels, utils, WriteOptions};
use super::super::{nested, utils, WriteOptions};
use super::basic::{build_statistics, encode_plain};
use crate::io::parquet::read::schema::is_nullable;
use crate::io::parquet::write::Nested;
use crate::{
array::{Array, BinaryArray, Offset},
error::Result,
};

pub fn array_to_page<O, OO>(
pub fn array_to_page<O>(
array: &BinaryArray<O>,
options: WriteOptions,
type_: PrimitiveType,
nested: levels::NestedInfo<OO>,
nested: Vec<Nested>,
) -> Result<DataPage>
where
OO: Offset,
O: Offset,
{
let is_optional = is_nullable(&type_.field_info);

let validity = array.validity();

let mut buffer = vec![];
levels::write_rep_levels(&mut buffer, &nested, options.version)?;
let repetition_levels_byte_length = buffer.len();

levels::write_def_levels(&mut buffer, &nested, validity, is_optional, options.version)?;
let definition_levels_byte_length = buffer.len() - repetition_levels_byte_length;
let (repetition_levels_byte_length, definition_levels_byte_length) =
nested::write_rep_and_def(options.version, &nested, &mut buffer)?;

encode_plain(array, is_optional, &mut buffer);

Expand All @@ -40,8 +35,8 @@ where

utils::build_plain_page(
buffer,
levels::num_values(nested.offsets()),
nested.offsets().len().saturating_sub(1),
nested::num_values(&nested),
nested[0].len(),
array.null_count(),
repetition_levels_byte_length,
definition_levels_byte_length,
Expand Down
27 changes: 10 additions & 17 deletions src/io/parquet/write/boolean/nested.rs
Original file line number Diff line number Diff line change
@@ -1,33 +1,26 @@
use parquet2::schema::types::PrimitiveType;
use parquet2::{encoding::Encoding, page::DataPage};

use super::super::{levels, utils, WriteOptions};
use super::super::{nested, utils, WriteOptions};
use super::basic::{build_statistics, encode_plain};
use crate::io::parquet::read::schema::is_nullable;
use crate::io::parquet::write::Nested;
use crate::{
array::{Array, BooleanArray, Offset},
array::{Array, BooleanArray},
error::Result,
};

pub fn array_to_page<O>(
pub fn array_to_page(
array: &BooleanArray,
options: WriteOptions,
type_: PrimitiveType,
nested: levels::NestedInfo<O>,
) -> Result<DataPage>
where
O: Offset,
{
nested: Vec<Nested>,
) -> Result<DataPage> {
let is_optional = is_nullable(&type_.field_info);

let validity = array.validity();

let mut buffer = vec![];
levels::write_rep_levels(&mut buffer, &nested, options.version)?;
let repetition_levels_byte_length = buffer.len();

levels::write_def_levels(&mut buffer, &nested, validity, is_optional, options.version)?;
let definition_levels_byte_length = buffer.len() - repetition_levels_byte_length;
let (repetition_levels_byte_length, definition_levels_byte_length) =
nested::write_rep_and_def(options.version, &nested, &mut buffer)?;

encode_plain(array, is_optional, &mut buffer)?;

Expand All @@ -39,8 +32,8 @@ where

utils::build_plain_page(
buffer,
levels::num_values(nested.offsets()),
nested.offsets().len().saturating_sub(1),
nested::num_values(&nested),
nested[0].len(),
array.null_count(),
repetition_levels_byte_length,
definition_levels_byte_length,
Expand Down
Loading