Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Improved dictionary invariants #1137

Merged
merged 1 commit into from
Jul 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions src/array/dictionary/ffi.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::{
array::{FromFfi, PrimitiveArray, ToFfi},
error::Result,
error::Error,
ffi,
};

Expand All @@ -25,16 +25,20 @@ unsafe impl<K: DictionaryKey> ToFfi for DictionaryArray<K> {
}

impl<K: DictionaryKey, A: ffi::ArrowArrayRef> FromFfi<A> for DictionaryArray<K> {
unsafe fn try_from_ffi(array: A) -> Result<Self> {
unsafe fn try_from_ffi(array: A) -> Result<Self, Error> {
// keys: similar to PrimitiveArray, but the datatype is the inner one
let validity = unsafe { array.validity() }?;
let values = unsafe { array.buffer::<K>(1) }?;

let data_type = K::PRIMITIVE.into();
let keys = PrimitiveArray::<K>::try_new(data_type, values, validity)?;
let values = array.dictionary()?.unwrap();
let data_type = array.data_type().clone();

let keys = PrimitiveArray::<K>::try_new(K::PRIMITIVE.into(), values, validity)?;
let values = array
.dictionary()?
.ok_or_else(|| Error::oos("Dictionary Array must contain a dictionary in ffi"))?;
let values = ffi::try_from(values)?;

Ok(DictionaryArray::<K>::from_data(keys, values))
// the assumption of this trait
DictionaryArray::<K>::try_new_unchecked(data_type, keys, values)
}
}
2 changes: 1 addition & 1 deletion src/array/dictionary/fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pub fn write_value<K: DictionaryKey, W: Write>(
let values = array.values();

if keys.is_valid(index) {
let key = keys.value(index).to_usize().unwrap();
let key = array.key_value(index);
get_display(values.as_ref(), null)(f, key)
} else {
write!(f, "{}", null)
Expand Down
17 changes: 1 addition & 16 deletions src/array/dictionary/iterator.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::bitmap::utils::{zip_validity, ZipValidity};
use crate::bitmap::utils::ZipValidity;
use crate::scalar::Scalar;
use crate::trusted_len::TrustedLen;

Expand Down Expand Up @@ -66,18 +66,3 @@ impl<'a, K: DictionaryKey> IntoIterator for &'a DictionaryArray<K> {
self.iter()
}
}

impl<'a, K: DictionaryKey> DictionaryArray<K> {
    /// Returns an iterator that zips the values of this array with the validity of its keys
    pub fn iter(&'a self) -> ZipIter<'a, K> {
        let values = DictionaryValuesIter::new(self);
        let validity = self.keys.validity().as_ref().map(|bitmap| bitmap.iter());
        zip_validity(values, validity)
    }

    /// Returns an iterator over the values of this array; the validity is not consulted here
    pub fn values_iter(&'a self) -> ValuesIter<'a, K> {
        DictionaryValuesIter::new(self)
    }
}
235 changes: 199 additions & 36 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
use std::hint::unreachable_unchecked;

use crate::{
bitmap::Bitmap,
bitmap::{
utils::{zip_validity, ZipValidity},
Bitmap,
},
datatypes::{DataType, IntegerType},
error::Error,
scalar::{new_scalar, Scalar},
trusted_len::TrustedLen,
types::NativeType,
};

Expand All @@ -13,12 +20,23 @@ pub use iterator::*;
pub use mutable::*;

use super::{new_empty_array, primitive::PrimitiveArray, Array};
use crate::scalar::NullScalar;
use super::{new_null_array, specification::check_indexes};

/// Trait denoting [`NativeType`]s that can be used as keys of a dictionary.
pub trait DictionaryKey: NativeType + num_traits::NumCast + num_traits::FromPrimitive {
pub trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> {
/// The corresponding [`IntegerType`] of this key
const KEY_TYPE: IntegerType;

/// Converts this key into a `usize` without handling a failed conversion.
/// # Safety
/// The caller _must_ have checked that the value can be cast to `usize`:
/// the failure branch is declared unreachable, so reaching it is undefined behavior.
#[inline]
unsafe fn as_usize(self) -> usize {
    // the error case is ruled out by the caller as per the safety contract
    self.try_into()
        .unwrap_or_else(|_| unreachable_unchecked())
}
}

impl DictionaryKey for i8 {
Expand Down Expand Up @@ -46,47 +64,166 @@ impl DictionaryKey for u64 {
const KEY_TYPE: IntegerType = IntegerType::UInt64;
}

/// An [`Array`] whose values are encoded by keys. This [`Array`] is useful when the cardinality of
/// An [`Array`] whose values are stored as indices. This [`Array`] is useful when the cardinality of
/// values is low compared to the length of the [`Array`].
///
/// # Safety
/// This struct guarantees that each item of [`DictionaryArray::keys`] is castable to `usize` and
/// its value is smaller than [`DictionaryArray::values`]`.len()`. In other words, you can safely
/// use `unchecked` calls to retrieve the values
#[derive(Clone)]
pub struct DictionaryArray<K: DictionaryKey> {
// the declared type of the array; `try_new` and `try_new_unchecked` validate it with `check_data_type`
data_type: DataType,
// one index per slot; the validity of the array is the validity of these keys
keys: PrimitiveArray<K>,
// the dictionary that the keys index into
values: Box<dyn Array>,
}

/// Checks that `data_type` is a logical `DataType::Dictionary` whose key type is `key_type`
/// and whose values type has the same logical type as `values_data_type`.
/// # Errors
/// Returns an out-of-spec [`Error`] when any of the three conditions above does not hold.
fn check_data_type(
    key_type: IntegerType,
    data_type: &DataType,
    values_data_type: &DataType,
) -> Result<(), Error> {
    // anything that is not (logically) a dictionary is rejected up-front
    let (key, value) = match data_type.to_logical_type() {
        DataType::Dictionary(key, value, _) => (key, value),
        _ => {
            return Err(Error::oos(
                "DictionaryArray must be initialized with logical DataType::Dictionary",
            ))
        }
    };

    if *key != key_type {
        return Err(Error::oos(
            "DictionaryArray must be initialized with a DataType::Dictionary whose integer is compatible to its keys",
        ));
    }

    // values are compared by logical type on both sides
    if value.as_ref().to_logical_type() != values_data_type.to_logical_type() {
        return Err(Error::oos(
            "DictionaryArray must be initialized with a DataType::Dictionary whose value is equal to its values",
        ));
    }

    Ok(())
}

impl<K: DictionaryKey> DictionaryArray<K> {
/// Creates a [`DictionaryArray`] after validating both its data type and its keys.
/// # Implementation
/// This function is `O(N)` where `N` is the length of keys
/// # Errors
/// This function errors iff
/// * the logical type of `data_type` is not `DataType::Dictionary`
/// * the key type of `data_type` is not compatible with `keys`
/// * the values type of `data_type` is not equal to `values.data_type()`
/// * any of the keys' values is not representable as `usize` or is `>= values.len()`
pub fn try_new(
    data_type: DataType,
    keys: PrimitiveArray<K>,
    values: Box<dyn Array>,
) -> Result<Self, Error> {
    // the declared type is validated first, then the keys against the dictionary length
    check_data_type(K::KEY_TYPE, &data_type, values.data_type())?;
    let dictionary_len = values.len();
    check_indexes(keys.values(), dictionary_len)?;

    let array = Self {
        data_type,
        keys,
        values,
    };
    Ok(array)
}

/// Creates a [`DictionaryArray`] from `keys` and `values`, deriving its data type from `values`.
/// The data type is the one `default_data_type` returns for `values.data_type()`.
/// # Implementation
/// This function is `O(N)` where `N` is the length of keys
/// # Errors
/// This function errors iff
/// * any of the keys' values is not representable as `usize` or is `>= values.len()`
pub fn try_from_keys(keys: PrimitiveArray<K>, values: Box<dyn Array>) -> Result<Self, Error> {
    let values_type = values.data_type().clone();
    Self::try_new(Self::default_data_type(values_type), keys, values)
}

/// Creates a [`DictionaryArray`], validating its data type but not its keys.
/// # Errors
/// This function errors iff
/// * the logical type of `data_type` is not `DataType::Dictionary`
/// * the key type of `data_type` is not compatible with `keys`
/// * the values type of `data_type` is not equal to `values.data_type()`
/// # Safety
/// The caller must ensure that every key's value is representable as `usize` and is `< values.len()`
pub unsafe fn try_new_unchecked(
    data_type: DataType,
    keys: PrimitiveArray<K>,
    values: Box<dyn Array>,
) -> Result<Self, Error> {
    // only the data type is checked here; the keys are trusted as per the safety contract
    check_data_type(K::KEY_TYPE, &data_type, values.data_type()).map(|()| Self {
        data_type,
        keys,
        values,
    })
}

/// Returns a new empty [`DictionaryArray`].
pub fn new_empty(data_type: DataType) -> Self {
let values = Self::get_child(&data_type);
let values = Self::try_get_child(&data_type).unwrap();
let values = new_empty_array(values.clone());
let data_type = K::PRIMITIVE.into();
Self::from_data(PrimitiveArray::<K>::new_empty(data_type), values)
Self::try_new(
data_type,
PrimitiveArray::<K>::new_empty(K::PRIMITIVE.into()),
values,
)
.unwrap()
}

/// Returns a [`DictionaryArray`] whose elements are all null
#[inline]
pub fn new_null(data_type: DataType, length: usize) -> Self {
let values = Self::get_child(&data_type);
let data_type = K::PRIMITIVE.into();
Self::from_data(
PrimitiveArray::<K>::new_null(data_type, length),
new_empty_array(values.clone()),
let values = Self::try_get_child(&data_type).unwrap();
let values = new_null_array(values.clone(), 1);
Self::try_new(
data_type,
PrimitiveArray::<K>::new_null(K::PRIMITIVE.into(), length),
values,
)
.unwrap()
}

/// The canonical method to create a new [`DictionaryArray`].
pub fn from_data(keys: PrimitiveArray<K>, values: Box<dyn Array>) -> Self {
let data_type =
DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone()), false);
/// Returns an iterator of [`Option<Box<dyn Scalar>>`], built by zipping the values
/// of this array with the validity of its keys.
/// # Implementation
/// This function will allocate a new [`Scalar`] per item and is usually not performant.
/// Consider calling `keys_iter` and `values`, downcasting `values`, and iterating over that.
pub fn iter(&self) -> ZipValidity<Box<dyn Scalar>, DictionaryValuesIter<K>> {
    let values = DictionaryValuesIter::new(self);
    let validity = self.keys.validity().as_ref().map(|bitmap| bitmap.iter());
    zip_validity(values, validity)
}

Self {
data_type,
keys,
values,
/// Returns an iterator of [`Box<dyn Scalar>`]
/// # Implementation
/// This function will allocate a new [`Scalar`] per item and is usually not performant.
/// Consider calling `keys_iter` and `values`, downcasting `values`, and iterating over that.
pub fn values_iter(&self) -> DictionaryValuesIter<K> {
// unlike `iter`, the validity of the keys is not consulted here
DictionaryValuesIter::new(self)
}

/// Returns the [`DataType`] of this [`DictionaryArray`]
/// # Implementation
/// This borrows the data type stored in the array; nothing is cloned or recomputed.
#[inline]
pub fn data_type(&self) -> &DataType {
&self.data_type
}

/// Returns the "is ordered" flag carried by the `DataType::Dictionary` of this [`DictionaryArray`]
#[inline]
pub fn is_ordered(&self) -> bool {
    if let DataType::Dictionary(_, _, ordered) = self.data_type.to_logical_type() {
        *ordered
    } else {
        // `try_new` and `try_new_unchecked` reject any other logical type
        unreachable!()
    }
}

/// Returns the `DataType::Dictionary` built from `K::KEY_TYPE` and `values_datatype`,
/// with the "is ordered" flag set to `false`.
pub(crate) fn default_data_type(values_datatype: DataType) -> DataType {
    let values = Box::new(values_datatype);
    DataType::Dictionary(K::KEY_TYPE, values, false)
}

/// Creates a new [`DictionaryArray`] by slicing the existing [`DictionaryArray`].
/// # Panics
/// iff `offset + length > self.len()`.
Expand Down Expand Up @@ -124,10 +261,7 @@ impl<K: DictionaryKey> DictionaryArray<K> {
pub fn set_validity(&mut self, validity: Option<Bitmap>) {
// the validity is not stored on this struct: it is delegated to the keys
self.keys.set_validity(validity);
}
}

// accessors
impl<K: DictionaryKey> DictionaryArray<K> {
/// Returns the length of this array
#[inline]
pub fn len(&self) -> usize {
Expand All @@ -147,21 +281,46 @@ impl<K: DictionaryKey> DictionaryArray<K> {
&self.keys
}

/// Returns an iterator over the value of every key of the [`DictionaryArray`], converted to `usize`.
/// The items are plain `usize`: the validity of the keys is not part of the output.
#[inline]
pub fn keys_values_iter(&self) -> impl TrustedLen<Item = usize> + Clone + '_ {
    let raw_keys = self.keys.values_iter();
    // SAFETY: invariant of the struct - every key is castable to `usize`
    raw_keys.map(|key| unsafe { key.as_usize() })
}

/// Returns an iterator over the keys of the [`DictionaryArray`] as `Option<usize>`
#[inline]
pub fn keys_iter(&self) -> impl TrustedLen<Item = Option<usize>> + Clone + '_ {
    let keys = self.keys.iter();
    // SAFETY: invariant of the struct - every key is castable to `usize`
    keys.map(|slot| slot.map(|key| unsafe { key.as_usize() }))
}

/// Returns the value of the key at position `index`, converted to `usize`
/// # Panics
/// This function panics iff `index >= self.len()`
#[inline]
pub fn key_value(&self, index: usize) -> usize {
    // the indexing is bounds-checked; only the conversion relies on `unsafe`
    let raw = self.keys.values()[index];
    // SAFETY: invariant of the struct - every key is castable to `usize`
    unsafe { raw.as_usize() }
}

/// Returns the values of the [`DictionaryArray`],
/// i.e. the dictionary that the keys index into.
#[inline]
pub fn values(&self) -> &Box<dyn Array> {
&self.values
}

/// Returns the value of the [`DictionaryArray`] at position `i`.
/// # Implementation
/// This function will allocate a new [`Scalar`] and is usually not performant.
/// Consider calling `keys` and `values`, downcasting `values`, and iterating over that.
/// # Panics
/// This function panics iff `index >= self.len()`
#[inline]
pub fn value(&self, index: usize) -> Box<dyn Scalar> {
if self.keys.is_null(index) {
Box::new(NullScalar::new())
} else {
let index = self.keys.value(index).to_usize().unwrap();
new_scalar(self.values.as_ref(), index)
}
// safety - invariant of this struct
let index = unsafe { self.keys.value(index).as_usize() };
new_scalar(self.values.as_ref(), index)
}

/// Boxes self into a [`Box<dyn Array>`].
Expand All @@ -173,15 +332,16 @@ impl<K: DictionaryKey> DictionaryArray<K> {
pub fn arced(self) -> std::sync::Arc<dyn Array> {
// consumes `self`; the `Arc<Self>` is coerced to `Arc<dyn Array>` by the return type
std::sync::Arc::new(self)
}
}

impl<K: DictionaryKey> DictionaryArray<K> {
pub(crate) fn get_child(data_type: &DataType) -> &DataType {
match data_type {
pub(crate) fn try_get_child(data_type: &DataType) -> Result<&DataType, Error> {
Ok(match data_type.to_logical_type() {
DataType::Dictionary(_, values, _) => values.as_ref(),
DataType::Extension(_, inner, _) => Self::get_child(inner),
_ => panic!("DictionaryArray must be initialized with DataType::Dictionary"),
}
_ => {
return Err(Error::oos(
"Dictionaries must be initialized with DataType::Dictionary",
))
}
})
}
}

Expand Down Expand Up @@ -213,12 +373,15 @@ impl<K: DictionaryKey> Array for DictionaryArray<K> {
fn slice(&self, offset: usize, length: usize) -> Box<dyn Array> {
// boxes the result of `self.slice`; presumably this resolves to the inherent
// `DictionaryArray::slice` rather than recursing - confirm against the inherent impl
Box::new(self.slice(offset, length))
}

unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Box<dyn Array> {
// boxes the result of `self.slice_unchecked`, forwarding `offset` and `length` unchanged;
// presumably the inherent method - its safety contract is then the caller's to uphold (confirm)
Box::new(self.slice_unchecked(offset, length))
}

fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
// `self` is only borrowed, so it is cloned first and `with_validity` is called on the clone
// (presumably the inherent, consuming method - confirm), whose result is then boxed
Box::new(self.clone().with_validity(validity))
}

fn to_boxed(&self) -> Box<dyn Array> {
// boxes a clone of `self`, leaving the original untouched
Box::new(self.clone())
}
Expand Down
Loading