Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Added support for deserializing JSON from iterator #989

Merged
merged 12 commits into from May 17, 2022
53 changes: 53 additions & 0 deletions src/array/utf8/json.rs
@@ -0,0 +1,53 @@
use indexmap::set::IndexSet as HashSet;
use serde_json;
use serde_json::Value;

use super::{Offset, Utf8Array};
use crate::{
array::ArrayRef,
datatypes::DataType,
error::{ArrowError, Result},
io::json::read::{_deserialize, coerce_data_type, infer},
};

/// JSON inference and deserialization for a [`Utf8Array`] whose values are JSON documents.
impl<O: Offset> Utf8Array<O> {
    /// Infers a single [`DataType`] from the JSON documents stored in this array.
    ///
    /// Only the first `number_of_rows` slots are inspected; every slot is inspected
    /// when `number_of_rows` is `None`. Null slots are skipped, and documents whose
    /// inferred type is [`DataType::Null`] do not take part in the coercion.
    /// When no inspected row contributes a type, [`DataType::Null`] is returned.
    ///
    /// # Errors
    /// Returns an error when the array is empty, when an inspected row is not valid
    /// JSON, or when `infer` fails on a parsed value.
    pub fn json_infer(&self, number_of_rows: Option<usize>) -> Result<DataType> {
        if self.len() == 0 {
            return Err(ArrowError::ExternalFormat(
                "Cannot infer JSON types on empty Utf8Array".to_string(),
            ));
        }

        // Use the full length if no limit is provided
        let number_of_rows = number_of_rows.unwrap_or(self.len());

        // Insertion-ordered set: each distinct non-null type is recorded once.
        let mut data_types = HashSet::new();
        for row in self.iter().take(number_of_rows).flatten() {
            let v: Value = serde_json::from_str(row)?;
            let data_type = infer(&v)?;
            if data_type != DataType::Null {
                data_types.insert(data_type);
            }
        }

        // Every sampled row was a null slot or JSON `null` (or the sample was empty):
        // there is nothing to coerce, so do not hand an empty slice to the coercion helper.
        if data_types.is_empty() {
            return Ok(DataType::Null);
        }

        let v: Vec<&DataType> = data_types.iter().collect();
        Ok(coerce_data_type(&v))
    }

    /// Deserializes every row of this array as JSON into an array of type `data_type`.
    ///
    /// Null slots are treated as JSON `null` values before deserialization.
    ///
    /// # Errors
    /// Returns an error when a non-null row is not valid JSON.
    pub fn json_deserialize(&self, data_type: DataType) -> Result<ArrayRef> {
        let rows = self
            .iter()
            .map(|row| match row {
                Some(row) => serde_json::from_str(row).map_err(ArrowError::from),
                None => Ok(Value::Null),
            })
            .collect::<Result<Vec<Value>>>()?;

        Ok(_deserialize(&rows, data_type))
    }
}
5 changes: 5 additions & 0 deletions src/array/utf8/mod.rs
Expand Up @@ -19,6 +19,11 @@ mod mutable;
pub use iterator::*;
pub use mutable::*;

#[cfg(feature = "io_json")]
mod json;
#[cfg(feature = "io_json")]
pub use json::*;

/// A [`Utf8Array`] is arrow's equivalent of an immutable `Vec<Option<String>>`.
/// Cloning and slicing this struct is `O(1)`.
/// # Example
Expand Down
6 changes: 5 additions & 1 deletion src/io/json/read/deserialize.rs
Expand Up @@ -146,17 +146,21 @@ fn deserialize_struct<A: Borrow<Value>>(rows: &[A], data_type: DataType) -> Stru
.map(|f| (&f.name, (f.data_type(), vec![])))
.collect::<HashMap<_, _>>();

let mut validity = MutableBitmap::with_capacity(rows.len());

rows.iter().for_each(|row| {
match row.borrow() {
Value::Object(value) => {
values
.iter_mut()
.for_each(|(s, (_, inner))| inner.push(value.get(*s).unwrap_or(&Value::Null)));
validity.push(true);
}
_ => {
values
.iter_mut()
.for_each(|(_, (_, inner))| inner.push(&Value::Null));
validity.push(false);
}
};
});
Expand All @@ -166,7 +170,7 @@ fn deserialize_struct<A: Borrow<Value>>(rows: &[A], data_type: DataType) -> Stru
.map(|(_, (data_type, values))| _deserialize(&values, data_type.clone()))
.collect::<Vec<_>>();

StructArray::new(data_type, values, None)
StructArray::new(data_type, values, validity.into())
}

fn deserialize_dictionary<K: DictionaryKey, A: Borrow<Value>>(
Expand Down
23 changes: 23 additions & 0 deletions tests/it/array/utf8/json.rs
@@ -0,0 +1,23 @@
use arrow2::array::{Array, StructArray, Utf8Array};

#[test]
fn json_deserialize() {
    // Two JSON object rows interleaved with two null slots.
    let rows = [
        Some(r#"{"a": 1, "b": [{"c": 0}, {"c": 1}]}"#),
        None,
        Some(r#"{"a": 2, "b": [{"c": 2}, {"c": 5}]}"#),
        None,
    ];
    let source = Utf8Array::<i64>::from(rows);

    // Infer the type from all rows, then deserialize with the inferred type.
    let inferred = source.json_infer(None).unwrap();
    let deserialized = source.json_deserialize(inferred).unwrap();

    // The inferred type of object rows must produce a StructArray.
    let structs = deserialized
        .as_any()
        .downcast_ref::<StructArray>()
        .unwrap();

    // Length and nullness of the input must carry over to the struct array.
    assert_eq!(source.len(), structs.len());
    assert_eq!(source.null_count(), structs.null_count());
    assert_eq!(source.validity().unwrap(), structs.validity().unwrap());

    // Fields keep the order in which the keys appear in the JSON objects.
    let names: Vec<&str> = structs
        .fields()
        .iter()
        .map(|field| field.name.as_str())
        .collect();
    assert_eq!(names, ["a", "b"]);
}
2 changes: 2 additions & 0 deletions tests/it/array/utf8/mod.rs
@@ -1,5 +1,7 @@
use arrow2::{array::*, bitmap::Bitmap, buffer::Buffer, datatypes::DataType, error::Result};

#[cfg(feature = "io_json")]
mod json;
mod mutable;
mod to_mutable;

Expand Down
18 changes: 15 additions & 3 deletions tests/it/io/ndjson/mod.rs
Expand Up @@ -220,11 +220,19 @@ fn case_struct() -> (String, Arc<dyn Array>) {

// build expected output
let d = Utf8Array::<i32>::from(&vec![Some("text"), None, Some("text"), None]);
let c = StructArray::from_data(DataType::Struct(vec![d_field]), vec![Arc::new(d)], None);
let c = StructArray::from_data(
DataType::Struct(vec![d_field]),
vec![Arc::new(d)],
Some(Bitmap::from_u8_slice([0b11111101], 4)),
);

let b = BooleanArray::from(vec![Some(true), Some(false), Some(true), None]);
let inner = DataType::Struct(vec![Field::new("b", DataType::Boolean, true), c_field]);
let expected = StructArray::from_data(inner, vec![Arc::new(b), Arc::new(c)], None);
let expected = StructArray::from_data(
inner,
vec![Arc::new(b), Arc::new(c)],
Some(Bitmap::from_u8_slice([0b11110111], 4)),
);

let data_type = DataType::Struct(fields);

Expand Down Expand Up @@ -268,7 +276,11 @@ fn case_nested_list() -> (String, Arc<dyn Array>) {
None,
]);

let c = StructArray::from_data(DataType::Struct(vec![d_field]), vec![Arc::new(d)], None);
let c = StructArray::from_data(
DataType::Struct(vec![d_field]),
vec![Arc::new(d)],
Some(Bitmap::from_u8_slice([0b11111011], 6)),
);

let b = BooleanArray::from(vec![
Some(true),
Expand Down