From e0ec3814e7ddaa3c635e3cf85f1a05d849bb4264 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Tue, 21 Jun 2022 05:32:04 +0000 Subject: [PATCH] Accept i64 for u32 --- src/io/parquet/read/deserialize/mod.rs | 31 +++++++++++++++++------ src/io/parquet/read/deserialize/simple.rs | 27 +++++++++++++++----- src/io/parquet/read/statistics/mod.rs | 12 ++++++++- tests/it/io/parquet/read.rs | 5 ++++ 4 files changed, 60 insertions(+), 15 deletions(-) diff --git a/src/io/parquet/read/deserialize/mod.rs b/src/io/parquet/read/deserialize/mod.rs index 433d807baff..d4a7de6ee80 100644 --- a/src/io/parquet/read/deserialize/mod.rs +++ b/src/io/parquet/read/deserialize/mod.rs @@ -188,14 +188,29 @@ where } UInt32 => { init.push(InitNested::Primitive(field.is_nullable)); - types.pop(); - primitive::iter_to_arrays_nested( - columns.pop().unwrap(), - init, - field.data_type().clone(), - chunk_size, - |x: i32| x as u32, - ) + let type_ = types.pop().unwrap(); + match type_.physical_type { + PhysicalType::Int32 => primitive::iter_to_arrays_nested( + columns.pop().unwrap(), + init, + field.data_type().clone(), + chunk_size, + |x: i32| x as u32, + ), + // some implementations of parquet write arrow's u32 into i64. + PhysicalType::Int64 => primitive::iter_to_arrays_nested( + columns.pop().unwrap(), + init, + field.data_type().clone(), + chunk_size, + |x: i64| x as u32, + ), + other => { + return Err(Error::nyi(format!( + "Deserializing UInt32 from {other:?}'s parquet" + ))) + } + } } UInt64 => { init.push(InitNested::Primitive(field.is_nullable)); diff --git a/src/io/parquet/read/deserialize/simple.rs b/src/io/parquet/read/deserialize/simple.rs index 6d3026c18b0..8a8ef653da6 100644 --- a/src/io/parquet/read/deserialize/simple.rs +++ b/src/io/parquet/read/deserialize/simple.rs @@ -82,12 +82,27 @@ pub fn page_iter_to_arrays<'a, I: 'a + DataPages>( chunk_size, |x: i32| x as u16, ))), - UInt32 => dyn_iter(iden(primitive::Iter::new( - pages, - data_type, - chunk_size, - |x: i32| x as u32, - ))), + UInt32 => match physical_type { + PhysicalType::Int32 => dyn_iter(iden(primitive::Iter::new( + pages, + data_type, + chunk_size, + |x: i32| x as u32, + ))), + // some implementations of parquet write arrow's u32 into i64. + PhysicalType::Int64 => dyn_iter(iden(primitive::Iter::new( + pages, + data_type, + chunk_size, + |x: i64| x as u32, + ))), + other => { + return Err(Error::NotYetImplemented(format!( + "Reading uin32 from {:?}-encoded parquet still not implemented", + other + ))) + } + }, Int8 => dyn_iter(iden(primitive::Iter::new( pages, data_type, diff --git a/src/io/parquet/read/statistics/mod.rs b/src/io/parquet/read/statistics/mod.rs index 4f3eaeff7bd..540dac4df97 100644 --- a/src/io/parquet/read/statistics/mod.rs +++ b/src/io/parquet/read/statistics/mod.rs @@ -410,7 +410,17 @@ fn push( } UInt8 => primitive::push(from, min, max, |x: i32| Ok(x as u8)), UInt16 => primitive::push(from, min, max, |x: i32| Ok(x as u16)), - UInt32 => primitive::push(from, min, max, |x: i32| Ok(x as u32)), + UInt32 => match physical_type { + // some implementations of parquet write arrow's u32 into i64. + ParquetPhysicalType::Int64 => primitive::push(from, min, max, |x: i64| Ok(x as u32)), + ParquetPhysicalType::Int32 => primitive::push(from, min, max, |x: i32| Ok(x as u32)), + other => { + return Err(Error::NotYetImplemented(format!( + "Can't decode UInt32 type from parquet type {:?}", + other + ))) + } + }, Int32 => primitive::push(from, min, max, |x: i32| Ok(x as i32)), Int64 | Date64 | Time64(_) | Duration(_) => { primitive::push(from, min, max, |x: i64| Ok(x as i64)) diff --git a/tests/it/io/parquet/read.rs b/tests/it/io/parquet/read.rs index 4e426ef5bae..356d74e193e 100644 --- a/tests/it/io/parquet/read.rs +++ b/tests/it/io/parquet/read.rs @@ -135,6 +135,11 @@ fn v1_u32_nullable() -> Result<()> { test_pyarrow_integration("uint32", 1, "basic", false, false, None) } +#[test] +fn v2_u32_nullable() -> Result<()> { + test_pyarrow_integration("uint32", 2, "basic", false, false, None) +} + #[test] fn v2_int64_nullable() -> Result<()> { test_pyarrow_integration("int64", 2, "basic", false, false, None)