Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Added support for FFI of dictionary-encoded arrays #267

Merged
merged 3 commits into from Aug 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 15 additions & 1 deletion arrow-pyarrow-integration-testing/tests/test_sql.py
Expand Up @@ -104,7 +104,7 @@ def test_decimal_roundtrip(self):
data = [
round(decimal.Decimal(722.82), 2),
round(decimal.Decimal(-934.11), 2),
None
None,
]
a = pyarrow.array(data, pyarrow.decimal128(5, 2))
b = arrow_pyarrow_integration_testing.round_trip(a)
Expand Down Expand Up @@ -179,3 +179,17 @@ def test_list_list_array(self):
b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_dict(self):
"""
Python -> Rust -> Python
"""
a = pyarrow.array(
["a", "a", "b", None, "c"],
pyarrow.dictionary(pyarrow.int64(), pyarrow.utf8()),
)
b = arrow_pyarrow_integration_testing.round_trip(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type
2 changes: 1 addition & 1 deletion src/array/binary/ffi.rs
Expand Up @@ -25,7 +25,7 @@ unsafe impl<O: Offset> ToFfi for BinaryArray<O> {

unsafe impl<O: Offset, A: ffi::ArrowArrayRef> FromFfi<A> for BinaryArray<O> {
fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.data_type()?;
let data_type = array.field()?.data_type().clone();
let expected = if O::is_large() {
DataType::LargeBinary
} else {
Expand Down
3 changes: 3 additions & 0 deletions src/array/boolean/ffi.rs
@@ -1,5 +1,6 @@
use crate::{
array::{FromFfi, ToFfi},
datatypes::DataType,
ffi,
};

Expand All @@ -22,6 +23,8 @@ unsafe impl ToFfi for BooleanArray {

unsafe impl<A: ffi::ArrowArrayRef> FromFfi<A> for BooleanArray {
fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.field()?.data_type().clone();
assert_eq!(data_type, DataType::Boolean);
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
Expand Down
39 changes: 39 additions & 0 deletions src/array/dictionary/ffi.rs
@@ -0,0 +1,39 @@
use crate::{
array::{FromFfi, PrimitiveArray, ToFfi},
error::Result,
ffi,
};

use super::{DictionaryArray, DictionaryKey};

unsafe impl<K: DictionaryKey> ToFfi for DictionaryArray<K> {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
self.keys.buffers()
}

#[inline]
fn offset(&self) -> usize {
self.offset
}
}

unsafe impl<K: DictionaryKey, A: ffi::ArrowArrayRef> FromFfi<A> for DictionaryArray<K> {
fn try_from_ffi(array: A) -> Result<Self> {
// keys: similar to PrimitiveArray, but the datatype is the inner one
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
let mut values = unsafe { array.buffer::<K>(0) }?;

if offset > 0 {
values = values.slice(offset, length);
validity = validity.map(|x| x.slice(offset, length))
}
let keys = PrimitiveArray::<K>::from_data(K::DATA_TYPE, values, validity);
// values
let values = array.dictionary()?.unwrap();
let values = ffi::try_from(values)?.into();

Ok(DictionaryArray::<K>::from_data(keys, values))
}
}
14 changes: 2 additions & 12 deletions src/array/dictionary/mod.rs
Expand Up @@ -6,12 +6,13 @@ use crate::{
types::{NativeType, NaturalDataType},
};

mod ffi;
mod iterator;
mod mutable;
pub use iterator::*;
pub use mutable::*;

use super::{ffi::ToFfi, new_empty_array, primitive::PrimitiveArray, Array};
use super::{new_empty_array, primitive::PrimitiveArray, Array};

/// Trait denoting [`NativeType`]s that can be used as keys of a dictionary.
pub trait DictionaryKey: NativeType + NaturalDataType + num::NumCast + num::FromPrimitive {}
Expand Down Expand Up @@ -143,14 +144,3 @@ where
write!(f, "}}")
}
}

unsafe impl<K: DictionaryKey> ToFfi for DictionaryArray<K> {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
vec![self.keys.validity().as_ref().map(|x| x.as_ptr())]
}

#[inline]
fn offset(&self) -> usize {
self.offset
}
}
37 changes: 26 additions & 11 deletions src/array/ffi.rs
Expand Up @@ -29,13 +29,28 @@ pub unsafe trait FromFfi<T: ffi::ArrowArrayRef>: Sized {
macro_rules! ffi_dyn {
($array:expr, $ty:ty) => {{
let array = $array.as_any().downcast_ref::<$ty>().unwrap();
(array.buffers(), array.children())
(array.buffers(), array.children(), None)
}};
}

type BuffersChildren = (Vec<Option<std::ptr::NonNull<u8>>>, Vec<Arc<dyn Array>>);
macro_rules! ffi_dict_dyn {
($array:expr, $ty:ty) => {{
let array = $array.as_any().downcast_ref::<$ty>().unwrap();
(
array.buffers(),
array.children(),
Some(array.values().clone()),
)
}};
}

type BuffersChildren = (
Vec<Option<std::ptr::NonNull<u8>>>,
Vec<Arc<dyn Array>>,
Option<Arc<dyn Array>>,
);

pub fn buffers_children(array: &dyn Array) -> BuffersChildren {
pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren {
match array.data_type() {
DataType::Null => ffi_dyn!(array, NullArray),
DataType::Boolean => ffi_dyn!(array, BooleanArray),
Expand Down Expand Up @@ -72,14 +87,14 @@ pub fn buffers_children(array: &dyn Array) -> BuffersChildren {
DataType::Struct(_) => ffi_dyn!(array, StructArray),
DataType::Union(_) => unimplemented!(),
DataType::Dictionary(key_type, _) => match key_type.as_ref() {
DataType::Int8 => ffi_dyn!(array, DictionaryArray::<i8>),
DataType::Int16 => ffi_dyn!(array, DictionaryArray::<i16>),
DataType::Int32 => ffi_dyn!(array, DictionaryArray::<i32>),
DataType::Int64 => ffi_dyn!(array, DictionaryArray::<i64>),
DataType::UInt8 => ffi_dyn!(array, DictionaryArray::<u8>),
DataType::UInt16 => ffi_dyn!(array, DictionaryArray::<u16>),
DataType::UInt32 => ffi_dyn!(array, DictionaryArray::<u32>),
DataType::UInt64 => ffi_dyn!(array, DictionaryArray::<u64>),
DataType::Int8 => ffi_dict_dyn!(array, DictionaryArray::<i8>),
DataType::Int16 => ffi_dict_dyn!(array, DictionaryArray::<i16>),
DataType::Int32 => ffi_dict_dyn!(array, DictionaryArray::<i32>),
DataType::Int64 => ffi_dict_dyn!(array, DictionaryArray::<i64>),
DataType::UInt8 => ffi_dict_dyn!(array, DictionaryArray::<u8>),
DataType::UInt16 => ffi_dict_dyn!(array, DictionaryArray::<u16>),
DataType::UInt32 => ffi_dict_dyn!(array, DictionaryArray::<u32>),
DataType::UInt64 => ffi_dict_dyn!(array, DictionaryArray::<u64>),
_ => unreachable!(),
},
}
Expand Down
2 changes: 1 addition & 1 deletion src/array/list/ffi.rs
Expand Up @@ -24,7 +24,7 @@ unsafe impl<O: Offset> ToFfi for ListArray<O> {

unsafe impl<O: Offset, A: ffi::ArrowArrayRef> FromFfi<A> for ListArray<O> {
fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.data_type()?;
let data_type = array.field()?.data_type().clone();
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
Expand Down
2 changes: 1 addition & 1 deletion src/array/mod.rs
Expand Up @@ -401,7 +401,7 @@ pub use specification::Offset;
pub use struct_::StructArray;
pub use utf8::{MutableUtf8Array, Utf8Array, Utf8ValuesIter};

pub(crate) use self::ffi::buffers_children;
pub(crate) use self::ffi::buffers_children_dictionary;
pub use self::ffi::FromFfi;
pub use self::ffi::ToFfi;

Expand Down
2 changes: 1 addition & 1 deletion src/array/primitive/ffi.rs
Expand Up @@ -24,7 +24,7 @@ unsafe impl<T: NativeType> ToFfi for PrimitiveArray<T> {

unsafe impl<T: NativeType, A: ffi::ArrowArrayRef> FromFfi<A> for PrimitiveArray<T> {
fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.data_type()?;
let data_type = array.field()?.data_type().clone();
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
Expand Down
8 changes: 4 additions & 4 deletions src/array/struct_.rs
Expand Up @@ -77,8 +77,8 @@ impl StructArray {
}

impl StructArray {
pub fn get_fields(datatype: &DataType) -> &[Field] {
if let DataType::Struct(fields) = datatype {
pub fn get_fields(data_type: &DataType) -> &[Field] {
if let DataType::Struct(fields) = data_type {
fields
} else {
panic!("Wrong datatype passed to Struct.")
Expand Down Expand Up @@ -139,8 +139,8 @@ unsafe impl ToFfi for StructArray {

unsafe impl<A: ffi::ArrowArrayRef> FromFfi<A> for StructArray {
fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.data_type()?;
let fields = Self::get_fields(&data_type).to_vec();
let field = array.field()?;
let fields = Self::get_fields(field.data_type()).to_vec();

let length = array.array().len();
let offset = array.array().offset();
Expand Down
30 changes: 26 additions & 4 deletions src/ffi/array.rs
Expand Up @@ -22,7 +22,7 @@ use crate::array::{BooleanArray, FromFfi};
use crate::error::{ArrowError, Result};
use crate::types::days_ms;
use crate::{
array::{Array, BinaryArray, ListArray, PrimitiveArray, StructArray, Utf8Array},
array::*,
datatypes::{DataType, IntervalUnit},
};

Expand All @@ -32,8 +32,8 @@ use crate::{
/// * the data type is not supported
/// * the interface is not valid (e.g. a null pointer)
pub fn try_from<A: ArrowArrayRef>(array: A) -> Result<Box<dyn Array>> {
let data_type = array.data_type()?;
let array: Box<dyn Array> = match data_type {
let field = array.field()?;
let array: Box<dyn Array> = match field.data_type() {
DataType::Boolean => Box::new(BooleanArray::try_from_ffi(array)?),
DataType::Int8 => Box::new(PrimitiveArray::<i8>::try_from_ffi(array)?),
DataType::Int16 => Box::new(PrimitiveArray::<i16>::try_from_ffi(array)?),
Expand Down Expand Up @@ -66,6 +66,17 @@ pub fn try_from<A: ArrowArrayRef>(array: A) -> Result<Box<dyn Array>> {
DataType::List(_) => Box::new(ListArray::<i32>::try_from_ffi(array)?),
DataType::LargeList(_) => Box::new(ListArray::<i64>::try_from_ffi(array)?),
DataType::Struct(_) => Box::new(StructArray::try_from_ffi(array)?),
DataType::Dictionary(keys, _) => match keys.as_ref() {
DataType::Int8 => Box::new(DictionaryArray::<i8>::try_from_ffi(array)?),
DataType::Int16 => Box::new(DictionaryArray::<i16>::try_from_ffi(array)?),
DataType::Int32 => Box::new(DictionaryArray::<i32>::try_from_ffi(array)?),
DataType::Int64 => Box::new(DictionaryArray::<i64>::try_from_ffi(array)?),
DataType::UInt8 => Box::new(DictionaryArray::<u8>::try_from_ffi(array)?),
DataType::UInt16 => Box::new(DictionaryArray::<u16>::try_from_ffi(array)?),
DataType::UInt32 => Box::new(DictionaryArray::<u32>::try_from_ffi(array)?),
DataType::UInt64 => Box::new(DictionaryArray::<u64>::try_from_ffi(array)?),
_ => unreachable!(),
},
data_type => {
return Err(ArrowError::NotYetImplemented(format!(
"Reading DataType \"{}\" is not yet supported.",
Expand All @@ -80,7 +91,6 @@ pub fn try_from<A: ArrowArrayRef>(array: A) -> Result<Box<dyn Array>> {
#[cfg(test)]
mod tests {
use super::*;
use crate::array::*;
use crate::datatypes::TimeUnit;
use crate::{error::Result, ffi};
use std::sync::Arc;
Expand Down Expand Up @@ -199,4 +209,16 @@ mod tests {

test_round_trip(array)
}

#[test]
fn test_dict() -> Result<()> {
let data = vec![Some("a"), Some("a"), None, Some("b")];

let mut array = MutableDictionaryArray::<i32, MutableUtf8Array<i32>>::new();
array.try_extend(data)?;

let array: DictionaryArray<i32> = array.into();

test_round_trip(array)
}
}