Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added support for DataType::Map and MapArray (#464)
Browse files Browse the repository at this point in the history
Co-authored-by: Wakahisa <nevilledips@gmail.com>
  • Loading branch information
jorgecarleitao and nevi-me committed Oct 2, 2021
1 parent 2d84c2d commit f7f6186
Show file tree
Hide file tree
Showing 31 changed files with 658 additions and 38 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ we also use the `0.x.y` versioning, since we are iterating over the API.
## Features in the original not available in this crate

* Parquet read and write of struct and nested lists.
* Map type

## Features in this crate not in pyarrow

Expand All @@ -86,7 +85,7 @@ we also use the `0.x.y` versioning, since we are iterating over the API.

## Features in pyarrow not in this crate

Too many to enumerate; e.g. nested dictionary arrays, map, nested parquet.
Too many to enumerate; e.g. nested dictionary arrays, nested parquet.

## FAQ

Expand Down
17 changes: 17 additions & 0 deletions arrow-pyarrow-integration-testing/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,23 @@ def test_dict(self):
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_map(self):
"""
Python -> Rust -> Python
"""
offsets = [0, None, 2, 6]
pykeys = [b"a", b"b", b"c", b"d", b"e", b"f"]
pyitems = [1, 2, 3, None, 4, 5]
keys = pyarrow.array(pykeys, type="binary")
items = pyarrow.array(pyitems, type="i4")

a = pyarrow.MapArray.from_arrays(offsets, keys, items)
b = arrow_pyarrow_integration_testing.round_trip_array(a)

b.validate(full=True)
assert a.to_pylist() == b.to_pylist()
assert a.type == b.type

def test_sparse_union(self):
"""
Python -> Rust -> Python
Expand Down
23 changes: 19 additions & 4 deletions integration-testing/unskip.patch
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py
index d0c4b3d6c..936351c80 100644
index d0c4b3d6c..0ce29fb8a 100644
--- a/dev/archery/archery/integration/datagen.py
+++ b/dev/archery/archery/integration/datagen.py
@@ -1568,8 +1568,7 @@ def get_generated_json_files(tempdir=None):
Expand All @@ -12,7 +12,7 @@ index d0c4b3d6c..936351c80 100644

generate_decimal256_case()
.skip_category('Go') # TODO(ARROW-7948): Decimal + Go
@@ -1579,13 +1578,11 @@ def get_generated_json_files(tempdir=None):
@@ -1579,17 +1578,14 @@ def get_generated_json_files(tempdir=None):
generate_datetime_case(),

generate_interval_case()
Expand All @@ -27,8 +27,13 @@ index d0c4b3d6c..936351c80 100644
+ .skip_category('JS'),


generate_map_case()
@@ -1603,13 +1600,11 @@ def get_generated_json_files(tempdir=None):
- generate_map_case()
- .skip_category('Rust'),
+ generate_map_case(),

generate_non_canonical_map_case()
.skip_category('Java') # TODO(ARROW-8715)
@@ -1603,13 +1599,11 @@ def get_generated_json_files(tempdir=None):

generate_nested_large_offsets_case()
.skip_category('Go')
Expand All @@ -44,3 +49,13 @@ index d0c4b3d6c..936351c80 100644

generate_custom_metadata_case()
.skip_category('JS'),
@@ -1634,8 +1628,7 @@ def get_generated_json_files(tempdir=None):

generate_extension_case()
.skip_category('Go') # TODO(ARROW-3039): requires dictionaries
- .skip_category('JS')
- .skip_category('Rust'),
+ .skip_category('JS'),
]

generated_paths = []
1 change: 1 addition & 0 deletions src/array/display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ pub fn get_value_display<'a>(array: &'a dyn Array) -> Box<dyn Fn(usize) -> Strin
DataType::UInt64 => dyn_dict!(array, u64),
_ => unreachable!(),
},
Map(_, _) => todo!(),
Struct(_) => {
let a = array.as_any().downcast_ref::<StructArray>().unwrap();
let displays = a
Expand Down
5 changes: 5 additions & 0 deletions src/array/equal/map.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
use crate::array::{Array, MapArray};

pub(super) fn equal(lhs: &MapArray, rhs: &MapArray) -> bool {
lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter())
}
6 changes: 6 additions & 0 deletions src/array/equal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ mod dictionary;
mod fixed_size_binary;
mod fixed_size_list;
mod list;
mod map;
mod null;
mod primitive;
mod struct_;
Expand Down Expand Up @@ -235,5 +236,10 @@ pub fn equal(lhs: &dyn Array, rhs: &dyn Array) -> bool {
let rhs = rhs.as_any().downcast_ref().unwrap();
union::equal(lhs, rhs)
}
Map => {
let lhs = lhs.as_any().downcast_ref().unwrap();
let rhs = rhs.as_any().downcast_ref().unwrap();
map::equal(lhs, rhs)
}
}
}
1 change: 1 addition & 0 deletions src/array/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ pub fn buffers_children_dictionary(array: &dyn Array) -> BuffersChildren {
FixedSizeList => ffi_dyn!(array, FixedSizeListArray),
Struct => ffi_dyn!(array, StructArray),
Union => ffi_dyn!(array, UnionArray),
Map => ffi_dyn!(array, MapArray),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
let array = array.as_any().downcast_ref::<DictionaryArray<$T>>().unwrap();
Expand Down
2 changes: 1 addition & 1 deletion src/array/growable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ pub fn make_growable<'a>(
use_validity,
capacity
),
Union => todo!(),
Union | Map => todo!(),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
dyn_dict_growable!($T, arrays, use_validity, capacity)
Expand Down
21 changes: 8 additions & 13 deletions src/array/list/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,19 +129,14 @@ impl<O: Offset> ListArray<O> {
/// Returns the element at index `i`
#[inline]
pub fn value(&self, i: usize) -> Box<dyn Array> {
if self.is_null(i) {
new_empty_array(self.values.data_type().clone())
} else {
let offsets = self.offsets.as_slice();
let offset = offsets[i];
let offset_1 = offsets[i + 1];
let length = (offset_1 - offset).to_usize();

// Safety:
// One of the invariants of the struct
// is that offsets are in bounds
unsafe { self.values.slice_unchecked(offset.to_usize(), length) }
}
let offset = self.offsets[i];
let offset_1 = self.offsets[i + 1];
let length = (offset_1 - offset).to_usize();

// Safety:
// One of the invariants of the struct
// is that offsets are in bounds
unsafe { self.values.slice_unchecked(offset.to_usize(), length) }
}

/// Returns the element at index `i` as &str
Expand Down
41 changes: 41 additions & 0 deletions src/array/map/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
use std::sync::Arc;

use crate::{array::FromFfi, error::Result, ffi};

use super::super::{ffi::ToFfi, Array};
use super::MapArray;

unsafe impl ToFfi for MapArray {
fn buffers(&self) -> Vec<Option<std::ptr::NonNull<u8>>> {
vec![
self.validity.as_ref().map(|x| x.as_ptr()),
std::ptr::NonNull::new(self.offsets.as_ptr() as *mut u8),
]
}

fn offset(&self) -> usize {
self.offset
}

fn children(&self) -> Vec<Arc<dyn Array>> {
vec![self.field.clone()]
}
}

impl<A: ffi::ArrowArrayRef> FromFfi<A> for MapArray {
unsafe fn try_from_ffi(array: A) -> Result<Self> {
let data_type = array.field().data_type().clone();
let length = array.array().len();
let offset = array.array().offset();
let mut validity = unsafe { array.validity() }?;
let mut offsets = unsafe { array.buffer::<i32>(0) }?;
let child = array.child(0)?;
let values = ffi::try_from(child)?.into();

if offset > 0 {
offsets = offsets.slice(offset, length);
validity = validity.map(|x| x.slice(offset, length))
}
Ok(Self::from_data(data_type, offsets, values, validity))
}
}
85 changes: 85 additions & 0 deletions src/array/map/iterator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use crate::array::Array;
use crate::bitmap::utils::{zip_validity, ZipValidity};
use crate::trusted_len::TrustedLen;

use super::MapArray;

/// Iterator of values of an [`ListArray`].
#[derive(Clone, Debug)]
pub struct MapValuesIter<'a> {
array: &'a MapArray,
index: usize,
end: usize,
}

impl<'a> MapValuesIter<'a> {
#[inline]
pub fn new(array: &'a MapArray) -> Self {
Self {
array,
index: 0,
end: array.len(),
}
}
}

impl<'a> Iterator for MapValuesIter<'a> {
type Item = Box<dyn Array>;

#[inline]
fn next(&mut self) -> Option<Self::Item> {
if self.index == self.end {
return None;
}
let old = self.index;
self.index += 1;
// Safety:
// self.end is maximized by the length of the array
Some(unsafe { self.array.value_unchecked(old) })
}

#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
(self.end - self.index, Some(self.end - self.index))
}
}

unsafe impl<'a> TrustedLen for MapValuesIter<'a> {}

impl<'a> DoubleEndedIterator for MapValuesIter<'a> {
#[inline]
fn next_back(&mut self) -> Option<Self::Item> {
if self.index == self.end {
None
} else {
self.end -= 1;
// Safety:
// self.end is maximized by the length of the array
Some(unsafe { self.array.value_unchecked(self.end) })
}
}
}

impl<'a> IntoIterator for &'a MapArray {
type Item = Option<Box<dyn Array>>;
type IntoIter = ZipValidity<'a, Box<dyn Array>, MapValuesIter<'a>>;

fn into_iter(self) -> Self::IntoIter {
self.iter()
}
}

impl<'a> MapArray {
/// Returns an iterator of `Option<Box<dyn Array>>`
pub fn iter(&'a self) -> ZipValidity<'a, Box<dyn Array>, MapValuesIter<'a>> {
zip_validity(
MapValuesIter::new(self),
self.validity.as_ref().map(|x| x.iter()),
)
}

/// Returns an iterator of `Box<dyn Array>`
pub fn values_iter(&'a self) -> MapValuesIter<'a> {
MapValuesIter::new(self)
}
}
Loading

0 comments on commit f7f6186

Please sign in to comment.