Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Fix the inferred nullability when converting a nested parquet schema to arrow (#1565)
Browse files Browse the repository at this point in the history
  • Loading branch information
jhorstmann committed Oct 7, 2023
1 parent 710d6b3 commit dd80c89
Showing 1 changed file with 58 additions and 16 deletions.
74 changes: 58 additions & 16 deletions src/io/parquet/read/schema/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ fn to_group_type(
/// Returns whether a parquet field should be treated as nullable, judged
/// solely by its repetition.
///
/// Only `Repetition::Optional` yields `true`; `Repetition::Repeated` and
/// `Repetition::Required` both yield `false`.
pub(crate) fn is_nullable(field_info: &FieldInfo) -> bool {
    match field_info.repetition {
        Repetition::Optional => true,
        // A repeated field holds zero or more values; it is not itself an
        // optional value, so it is reported as non-nullable. The match stays
        // exhaustive (no `_` arm) so a new variant forces a decision here.
        Repetition::Repeated | Repetition::Required => false,
    }
}
Expand Down Expand Up @@ -353,12 +353,12 @@ fn to_list(
let field = fields.first().unwrap();
(
&field.get_field_info().name,
field.get_field_info().repetition != Repetition::Required,
field.get_field_info().repetition == Repetition::Optional,
)
}
_ => (
&item.get_field_info().name,
item.get_field_info().repetition != Repetition::Required,
item.get_field_info().repetition == Repetition::Optional,
),
};

Expand Down Expand Up @@ -611,7 +611,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("element", DataType::Utf8, true))),
DataType::List(Box::new(Field::new("element", DataType::Utf8, false))),
true,
));
}
Expand All @@ -623,7 +623,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("element", DataType::Int32, true))),
DataType::List(Box::new(Field::new("element", DataType::Int32, false))),
true,
));
}
Expand All @@ -642,7 +642,7 @@ mod tests {
]);
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("element", arrow_struct, true))),
DataType::List(Box::new(Field::new("element", arrow_struct, false))),
true,
));
}
Expand All @@ -658,7 +658,7 @@ mod tests {
let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("array", arrow_struct, true))),
DataType::List(Box::new(Field::new("array", arrow_struct, false))),
true,
));
}
Expand All @@ -674,7 +674,7 @@ mod tests {
let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
arrow_fields.push(Field::new(
"my_list",
DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))),
DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, false))),
true,
));
}
Expand All @@ -684,8 +684,50 @@ mod tests {
{
arrow_fields.push(Field::new(
"name",
DataType::List(Box::new(Field::new("name", DataType::Int32, true))),
true,
DataType::List(Box::new(Field::new("name", DataType::Int32, false))),
false,
));
}

let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
let fields = parquet_to_arrow_schema(parquet_schema.fields());

assert_eq!(arrow_fields, fields);
Ok(())
}

#[test]
fn test_parquet_list_with_struct() -> Result<()> {
let mut arrow_fields = Vec::new();

let message_type = "
message eventlog {
REQUIRED group events (LIST) {
REPEATED group array {
REQUIRED BYTE_ARRAY event_name (STRING);
REQUIRED INT64 event_time (TIMESTAMP(MILLIS,true));
}
}
}
";

{
let struct_fields = vec![
Field::new("event_name", DataType::Utf8, false),
Field::new(
"event_time",
DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())),
false,
),
];
arrow_fields.push(Field::new(
"events",
DataType::List(Box::new(Field::new(
"array",
DataType::Struct(struct_fields),
false,
))),
false,
));
}

Expand Down Expand Up @@ -812,9 +854,9 @@ mod tests {
DataType::List(Box::new(Field::new(
"innerGroup",
DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]),
true,
false,
))),
true,
false,
);

let outer_group_list = Field::new(
Expand All @@ -825,9 +867,9 @@ mod tests {
Field::new("leaf2", DataType::Int32, true),
inner_group_list,
]),
true,
false,
))),
true,
false,
);
arrow_fields.push(outer_group_list);
}
Expand Down Expand Up @@ -888,8 +930,8 @@ mod tests {
Field::new("string", DataType::Utf8, true),
Field::new(
"bools",
DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))),
true,
DataType::List(Box::new(Field::new("bools", DataType::Boolean, false))),
false,
),
Field::new("date", DataType::Date32, true),
Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
Expand Down

0 comments on commit dd80c89

Please sign in to comment.