Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Simpler nested parquet read #1241

Merged
merged 1 commit into from
Sep 2, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 13 additions & 16 deletions src/io/parquet/read/deserialize/nested_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -408,27 +408,20 @@ fn extend_offsets2<'a, D: NestedDecoder<'a>>(
decoder: &D,
additional: usize,
) -> Result<()> {
let mut values_count = vec![0; nested.len()];
let max_depth = nested.len();

for (depth, nest) in nested.iter().enumerate().skip(1) {
values_count[depth - 1] = nest.len() as i64
}
*values_count.last_mut().unwrap() = nested.last().unwrap().len() as i64;

let mut cum_sum = vec![0u32; nested.len() + 1];
let mut cum_sum = vec![0u32; max_depth + 1];
for (i, nest) in nested.iter().enumerate() {
let delta = nest.is_nullable() as u32 + nest.is_repeated() as u32;
cum_sum[i + 1] = cum_sum[i] + delta;
}

let mut cum_rep = vec![0u32; nested.len() + 1];
let mut cum_rep = vec![0u32; max_depth + 1];
for (i, nest) in nested.iter().enumerate() {
let delta = nest.is_repeated() as u32;
cum_rep[i + 1] = cum_rep[i] + delta;
}

let max_depth = nested.len() - 1;

let mut rows = 0;
while let Some((rep, def)) = page.iter.next() {
let rep = rep?;
Expand All @@ -438,22 +431,26 @@ fn extend_offsets2<'a, D: NestedDecoder<'a>>(
}

let mut is_required = false;
for (depth, nest) in nested.iter_mut().enumerate() {
for depth in 0..max_depth {
let right_level = rep <= cum_rep[depth] && def >= cum_sum[depth];
if is_required || right_level {
let length = nested
.get(depth + 1)
.map(|x| x.len() as i64)
// the last depth is the leaf, which is always increased by 1
.unwrap_or(1);

let nest = &mut nested[depth];

let is_valid = nest.is_nullable() && def > cum_sum[depth];
let length = values_count[depth];
nest.push(length, is_valid);
if depth > 0 {
values_count[depth - 1] = nest.len() as i64;
};
if nest.is_required() && !is_valid {
is_required = true;
} else {
is_required = false
};

if depth == max_depth {
if depth == max_depth - 1 {
// the leaf / primitive
let is_valid = (def != cum_sum[depth]) || !nest.is_nullable();
if right_level && is_valid {
Expand Down