Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Accept decoding parquet's i64 into u32 written by pyarrow #1090

Merged
merged 1 commit into from
Jun 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 23 additions & 8 deletions src/io/parquet/read/deserialize/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,14 +188,29 @@ where
}
UInt32 => {
init.push(InitNested::Primitive(field.is_nullable));
types.pop();
primitive::iter_to_arrays_nested(
columns.pop().unwrap(),
init,
field.data_type().clone(),
chunk_size,
|x: i32| x as u32,
)
let type_ = types.pop().unwrap();
match type_.physical_type {
PhysicalType::Int32 => primitive::iter_to_arrays_nested(
columns.pop().unwrap(),
init,
field.data_type().clone(),
chunk_size,
|x: i32| x as u32,
),
// some implementations of parquet write arrow's u32 into i64.
PhysicalType::Int64 => primitive::iter_to_arrays_nested(
columns.pop().unwrap(),
init,
field.data_type().clone(),
chunk_size,
|x: i64| x as u32,
),
other => {
return Err(Error::nyi(format!(
"Deserializing UInt32 from {other:?}'s parquet"
)))
}
}
}
UInt64 => {
init.push(InitNested::Primitive(field.is_nullable));
Expand Down
27 changes: 21 additions & 6 deletions src/io/parquet/read/deserialize/simple.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,27 @@ pub fn page_iter_to_arrays<'a, I: 'a + DataPages>(
chunk_size,
|x: i32| x as u16,
))),
UInt32 => dyn_iter(iden(primitive::Iter::new(
pages,
data_type,
chunk_size,
|x: i32| x as u32,
))),
UInt32 => match physical_type {
PhysicalType::Int32 => dyn_iter(iden(primitive::Iter::new(
pages,
data_type,
chunk_size,
|x: i32| x as u32,
))),
// some implementations of parquet write arrow's u32 into i64.
PhysicalType::Int64 => dyn_iter(iden(primitive::Iter::new(
pages,
data_type,
chunk_size,
|x: i64| x as u32,
))),
other => {
return Err(Error::NotYetImplemented(format!(
"Reading uin32 from {:?}-encoded parquet still not implemented",
other
)))
}
},
Int8 => dyn_iter(iden(primitive::Iter::new(
pages,
data_type,
Expand Down
12 changes: 11 additions & 1 deletion src/io/parquet/read/statistics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,17 @@ fn push(
}
UInt8 => primitive::push(from, min, max, |x: i32| Ok(x as u8)),
UInt16 => primitive::push(from, min, max, |x: i32| Ok(x as u16)),
UInt32 => primitive::push(from, min, max, |x: i32| Ok(x as u32)),
UInt32 => match physical_type {
// some implementations of parquet write arrow's u32 into i64.
ParquetPhysicalType::Int64 => primitive::push(from, min, max, |x: i64| Ok(x as u32)),
ParquetPhysicalType::Int32 => primitive::push(from, min, max, |x: i32| Ok(x as u32)),
other => {
return Err(Error::NotYetImplemented(format!(
"Can't decode UInt32 type from parquet type {:?}",
other
)))
}
},
Int32 => primitive::push(from, min, max, |x: i32| Ok(x as i32)),
Int64 | Date64 | Time64(_) | Duration(_) => {
primitive::push(from, min, max, |x: i64| Ok(x as i64))
Expand Down
6 changes: 5 additions & 1 deletion tests/it/io/parquet/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,15 @@ fn v1_timestamp_ms_nullable() -> Result<()> {
}

// Calls `test_pyarrow_integration` with `"uint32"`, `1`, `"basic"`, `false`, `false`, `None`
// and returns its result unchanged, so any error it reports fails this test.
// NOTE(review): the `v1_`/`_nullable` naming suggests the `1` selects the parquet
// version and that the data is nullable — confirm against the helper's signature.
// The `#[ignore]` attribute below keeps this test out of default runs; it is only
// executed when ignored tests are explicitly requested.
#[test]
#[ignore] // pyarrow issue; see https://issues.apache.org/jira/browse/ARROW-12201
fn v1_u32_nullable() -> Result<()> {
test_pyarrow_integration("uint32", 1, "basic", false, false, None)
}

// Calls `test_pyarrow_integration` with `"uint32"`, `2`, `"basic"`, `false`, `false`, `None`
// and returns its result unchanged, so any error it reports fails this test.
// NOTE(review): the `v2_`/`_nullable` naming suggests the `2` selects the parquet
// version and that the data is nullable — confirm against the helper's signature.
#[test]
fn v2_u32_nullable() -> Result<()> {
test_pyarrow_integration("uint32", 2, "basic", false, false, None)
}

#[test]
fn v2_int64_nullable() -> Result<()> {
test_pyarrow_integration("int64", 2, "basic", false, false, None)
Expand Down