Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

add dictionary serialization for csv-writer #515

Merged
merged 2 commits into from
Oct 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 49 additions & 2 deletions src/io/csv/write/serialize.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use lexical_core::ToLexical;

use crate::temporal_conversions;
use crate::types::NativeType;
use crate::types::{Index, NativeType};
use crate::util::lexical_to_bytes_mut;
use crate::{
array::{Array, BinaryArray, BooleanArray, PrimitiveArray, Utf8Array},
Expand All @@ -10,6 +10,10 @@ use crate::{
};

use super::iterator::{BufStreamingIterator, StreamingIterator};
use crate::array::{DictionaryArray, DictionaryKey, Offset};
use crate::bitmap::utils::ZipValidity;
use std::any::Any;
use std::slice::Iter;

/// Options to serialize logical types to CSV
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
Expand Down Expand Up @@ -265,6 +269,49 @@ pub fn new_serializer<'a>(
vec![],
))
}
_ => todo!(),
DataType::Dictionary(keys_dt, values_dt) => match &**values_dt {
DataType::LargeUtf8 => match &**keys_dt {
DataType::UInt32 => serialize_utf8_dict::<u32, i64>(array.as_any()),
DataType::UInt64 => serialize_utf8_dict::<u64, i64>(array.as_any()),
_ => todo!(),
},
DataType::Utf8 => match &**keys_dt {
DataType::UInt32 => serialize_utf8_dict::<u32, i32>(array.as_any()),
DataType::UInt64 => serialize_utf8_dict::<u64, i32>(array.as_any()),
_ => todo!(),
},
_ => {
panic!("only dictionary with string values are supported by csv writer")
}
},
dt => panic!("data type: {} not supported by csv writer", dt),
})
}

/// Helper for serializing a dictonary array. The generic parameters are:
/// - `K` for the type of the keys of the dictionary
/// - `O` for the type of the offsets in the Utf8Array: {i32, i64}
fn serialize_utf8_dict<'a, K: DictionaryKey + Index, O: Offset>(
array: &'a dyn Any,
) -> Box<dyn StreamingIterator<Item = [u8]> + 'a> {
let array = array.downcast_ref::<DictionaryArray<K>>().unwrap();
let keys = array.keys();
let values = array
.values()
.as_any()
.downcast_ref::<Utf8Array<O>>()
.unwrap();
Box::new(BufStreamingIterator::new(
keys.iter(),
move |x, buf| {
if let Some(x) = x {
let i = Index::to_usize(x);
if !values.is_null(i) {
let val = values.value(i);
buf.extend_from_slice(val.as_bytes());
}
}
},
vec![],
))
}
28 changes: 18 additions & 10 deletions tests/it/io/csv/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ fn data() -> RecordBatch {
Field::new("c4", DataType::Boolean, true),
Field::new("c5", DataType::Timestamp(TimeUnit::Millisecond, None), true),
Field::new("c6", DataType::Time32(TimeUnit::Second), false),
Field::new(
"c7",
DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)),
false,
),
]);

let c1 = Utf8Array::<i32>::from_slice([
Expand All @@ -29,6 +34,8 @@ fn data() -> RecordBatch {
.to(DataType::Timestamp(TimeUnit::Millisecond, None));
let c6 = PrimitiveArray::<i32>::from_slice(&[1234, 24680, 85563])
.to(DataType::Time32(TimeUnit::Second));
let keys = UInt32Array::from_slice(&[2, 0, 1]);
let c7 = DictionaryArray::from_data(keys, Arc::new(c1.clone()));

RecordBatch::try_new(
Arc::new(schema),
Expand All @@ -39,6 +46,7 @@ fn data() -> RecordBatch {
Arc::new(c4),
Arc::new(c5),
Arc::new(c6),
Arc::new(c7),
],
)
.unwrap()
Expand All @@ -61,13 +69,13 @@ fn write_csv() -> Result<()> {
// check
let buffer = writer.into_inner().unwrap().into_inner();
assert_eq!(
r#"c1,c2,c3,c4,c5,c6
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03
r#"c1,c2,c3,c4,c5,c6,c7
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,sed do eiusmod tempor
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,Lorem ipsum dolor sit amet
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,consectetur adipiscing elit
Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,sed do eiusmod tempor
consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,Lorem ipsum dolor sit amet
sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,consectetur adipiscing elit
"#
.to_string(),
String::from_utf8(buffer).unwrap(),
Expand All @@ -92,9 +100,9 @@ fn write_csv_custom_options() -> Result<()> {
// check
let buffer = writer.into_inner().unwrap().into_inner();
assert_eq!(
r#"Lorem ipsum dolor sit amet|123.564532|3|true||12:20:34 AM
consectetur adipiscing elit||2|false|2019-04-18T10:54:47.378000000|06:51:20 AM
sed do eiusmod tempor|-556132.25|1||2019-04-18T02:45:55.555000000|11:46:03 PM
r#"Lorem ipsum dolor sit amet|123.564532|3|true||12:20:34 AM|sed do eiusmod tempor
consectetur adipiscing elit||2|false|2019-04-18T10:54:47.378000000|06:51:20 AM|Lorem ipsum dolor sit amet
sed do eiusmod tempor|-556132.25|1||2019-04-18T02:45:55.555000000|11:46:03 PM|consectetur adipiscing elit
"#
.to_string(),
String::from_utf8(buffer).unwrap(),
Expand Down