From 451680d5ba5f112823551699347d44c3441b8ff0 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sat, 4 Jun 2022 05:40:49 +0000 Subject: [PATCH] Cleaned docs for BinaryArray --- src/array/binary/from.rs | 78 +------- src/array/binary/iterator.rs | 21 +- src/array/binary/mod.rs | 373 ++++++++++++++++++++++------------- src/array/equal/binary.rs | 2 +- src/array/utf8/mod.rs | 14 +- 5 files changed, 246 insertions(+), 242 deletions(-) diff --git a/src/array/binary/from.rs b/src/array/binary/from.rs index c4993a95828..aa575ccd9cc 100644 --- a/src/array/binary/from.rs +++ b/src/array/binary/from.rs @@ -1,88 +1,12 @@ use std::iter::FromIterator; -use crate::{array::Offset, trusted_len::TrustedLen}; +use crate::array::Offset; use super::{BinaryArray, MutableBinaryArray}; -impl BinaryArray { - /// Creates a new [`BinaryArray`] from slices of `&[u8]`. - pub fn from_slice, P: AsRef<[T]>>(slice: P) -> Self { - Self::from_trusted_len_values_iter(slice.as_ref().iter()) - } - - /// Creates a new [`BinaryArray`] from a slice of optional `&[u8]`. - // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it. - pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { - Self::from_trusted_len_iter(slice.as_ref().iter().map(|x| x.as_ref())) - } - - /// Creates a [`BinaryArray`] from an iterator of trusted length. - #[inline] - pub fn from_trusted_len_values_iter, I: TrustedLen>( - iterator: I, - ) -> Self { - MutableBinaryArray::::from_trusted_len_values_iter(iterator).into() - } - - /// Creates a new [`BinaryArray`] from a [`Iterator`] of `&str`. - pub fn from_iter_values, I: Iterator>(iterator: I) -> Self { - MutableBinaryArray::::from_iter_values(iterator).into() - } -} - impl> FromIterator> for BinaryArray { #[inline] fn from_iter>>(iter: I) -> Self { MutableBinaryArray::::from_iter(iter).into() } } - -impl BinaryArray { - /// Creates a [`BinaryArray`] from an iterator of trusted length. - /// # Safety - /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). - /// I.e. that `size_hint().1` correctly reports its length. - #[inline] - pub unsafe fn from_trusted_len_iter_unchecked(iterator: I) -> Self - where - P: AsRef<[u8]>, - I: Iterator>, - { - MutableBinaryArray::::from_trusted_len_iter_unchecked(iterator).into() - } - - /// Creates a [`BinaryArray`] from an iterator of trusted length. - #[inline] - pub fn from_trusted_len_iter(iterator: I) -> Self - where - P: AsRef<[u8]>, - I: TrustedLen>, - { - // soundness: I is `TrustedLen` - unsafe { Self::from_trusted_len_iter_unchecked(iterator) } - } - - /// Creates a [`BinaryArray`] from an falible iterator of trusted length. - /// # Safety - /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). - /// I.e. that `size_hint().1` correctly reports its length. - #[inline] - pub unsafe fn try_from_trusted_len_iter_unchecked(iterator: I) -> Result - where - P: AsRef<[u8]>, - I: IntoIterator, E>>, - { - MutableBinaryArray::::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into()) - } - - /// Creates a [`BinaryArray`] from an fallible iterator of trusted length. - #[inline] - pub fn try_from_trusted_len_iter(iter: I) -> Result - where - P: AsRef<[u8]>, - I: TrustedLen, E>>, - { - // soundness: I: TrustedLen - unsafe { Self::try_from_trusted_len_iter_unchecked(iter) } - } -} diff --git a/src/array/binary/iterator.rs b/src/array/binary/iterator.rs index 3dc085ab220..10f0e8a43de 100644 --- a/src/array/binary/iterator.rs +++ b/src/array/binary/iterator.rs @@ -1,8 +1,4 @@ -use crate::{ - array::Offset, - bitmap::utils::{zip_validity, ZipValidity}, - trusted_len::TrustedLen, -}; +use crate::{array::Offset, bitmap::utils::ZipValidity, trusted_len::TrustedLen}; use super::BinaryArray; @@ -50,19 +46,4 @@ impl<'a, O: Offset> IntoIterator for &'a BinaryArray { } } -impl<'a, O: Offset> BinaryArray { - /// Returns an iterator of `Option<&[u8]>` - pub fn iter(&'a self) -> ZipValidity<'a, &'a [u8], BinaryValueIter<'a, O>> { - zip_validity( - BinaryValueIter::new(self), - self.validity.as_ref().map(|x| x.iter()), - ) - } - - /// Returns an iterator of `&[u8]` - pub fn values_iter(&'a self) -> BinaryValueIter<'a, O> { - BinaryValueIter::new(self) - } -} - unsafe impl TrustedLen for BinaryValueIter<'_, O> {} diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 9f3c5dad675..ff6fb609ab8 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -1,8 +1,12 @@ use crate::{ - bitmap::Bitmap, + bitmap::{ + utils::{zip_validity, ZipValidity}, + Bitmap, + }, buffer::Buffer, datatypes::DataType, - error::{Error, Result}, + error::Error, + trusted_len::TrustedLen, }; use super::{ @@ -18,7 +22,32 @@ mod from; mod mutable; pub use mutable::*; -/// A [`BinaryArray`] is a nullable array of bytes - the Arrow equivalent of `Vec>>`. +/// A [`BinaryArray`] is Arrow's semantically equivalent of an immutable `Vec>>`. +/// It implements [`Array`]. +/// +/// The size of this struct is `O(1)`, as all data is stored behind an [`std::sync::Arc`]. +/// # Example +/// ``` +/// use arrow2::array::BinaryArray; +/// use arrow2::bitmap::Bitmap; +/// use arrow2::buffer::Buffer; +/// +/// let array = BinaryArray::::from([Some([1, 2].as_ref()), None, Some([3].as_ref())]); +/// assert_eq!(array.value(0), &[1, 2]); +/// assert_eq!(array.iter().collect::>(), vec![Some([1, 2].as_ref()), None, Some([3].as_ref())]); +/// assert_eq!(array.values_iter().collect::>(), vec![[1, 2].as_ref(), &[], &[3]]); +/// // the underlying representation: +/// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3])); +/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 3])); +/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); +/// ``` +/// +/// # Generic parameter +/// The generic parameter [`Offset`] can only be `i32` or `i64` and tradeoffs maximum array length with +/// memory usage: +/// * the sum of lengths of all elements cannot exceed `Offset::MAX` +/// * the total size of the underlying data is `array.len() * size_of::() + sum of lengths of all elements` +/// /// # Safety /// The following invariants hold: /// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`. @@ -31,9 +60,8 @@ pub struct BinaryArray { validity: Option, } -// constructors impl BinaryArray { - /// Creates a new [`BinaryArray`]. + /// Returns a [`BinaryArray`] created from its internal representation. /// /// # Errors /// This function returns an error iff: @@ -48,7 +76,7 @@ impl BinaryArray { offsets: Buffer, values: Buffer, validity: Option, - ) -> Result { + ) -> Result { try_check_offsets(&offsets, values.len())?; if validity @@ -74,31 +102,134 @@ impl BinaryArray { }) } - /// Creates a new [`BinaryArray`]. + /// Creates a new [`BinaryArray`] from slices of `&[u8]`. + pub fn from_slice, P: AsRef<[T]>>(slice: P) -> Self { + Self::from_trusted_len_values_iter(slice.as_ref().iter()) + } + + /// Creates a new [`BinaryArray`] from a slice of optional `&[u8]`. + // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it. + pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { + Self::from_trusted_len_iter(slice.as_ref().iter().map(|x| x.as_ref())) + } + + /// Returns an iterator of `Option<&[u8]>` over every element of this array. + pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter> { + zip_validity(self.values_iter(), self.validity.as_ref().map(|x| x.iter())) + } + + /// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity + pub fn values_iter(&self) -> BinaryValueIter { + BinaryValueIter::new(self) + } + + /// Returns the length of this array + #[inline] + pub fn len(&self) -> usize { + self.offsets.len() - 1 + } + + /// Returns the element at index `i` /// # Panics - /// * the offsets are not monotonically increasing - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. + /// iff `i >= self.len()` + #[inline] + pub fn value(&self, i: usize) -> &[u8] { + assert!(i < self.len()); + unsafe { self.value_unchecked(i) } + } + + /// Returns the element at index `i` + /// # Safety + /// Assumes that the `i < self.len`. + #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { + // soundness: the invariant of the function + let start = self.offsets.get_unchecked(i).to_usize(); + let end = self.offsets.get_unchecked(i + 1).to_usize(); + + // soundness: the invariant of the struct + self.values.get_unchecked(start..end) + } + + /// Returns the [`DataType`] of this array. + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + /// Returns the values of this [`Utf8Array`]. + #[inline] + pub fn values(&self) -> &Buffer { + &self.values + } + + /// Returns the offsets of this [`Utf8Array`]. + #[inline] + pub fn offsets(&self) -> &Buffer { + &self.offsets + } + + /// The optional validity. + #[inline] + pub fn validity(&self) -> Option<&Bitmap> { + self.validity.as_ref() + } + + /// Creates a new [`BinaryArray`] by slicing this [`BinaryArray`]. /// # Implementation - /// This function is `O(N)` - checking monotinicity is `O(N)` - pub fn new( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::try_new(data_type, offsets, values, validity).unwrap() + /// This function is `O(1)`: all data will be shared between both arrays. + /// # Panics + /// iff `offset + length > self.len()`. + #[must_use] + pub fn slice(&self, offset: usize, length: usize) -> Self { + assert!( + offset + length <= self.len(), + "the offset of the new Buffer cannot exceed the existing length" + ); + unsafe { self.slice_unchecked(offset, length) } } - /// Alias for `new` - pub fn from_data( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::new(data_type, offsets, values, validity) + /// Creates a new [`BinaryArray`] by slicing this [`BinaryArray`]. + /// # Implementation + /// This function is `O(1)`: all data will be shared between both arrays. + /// # Safety + /// The caller must ensure that `offset + length <= self.len()`. + #[must_use] + pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self { + let validity = self + .validity + .clone() + .map(|x| x.slice_unchecked(offset, length)); + let offsets = self.offsets.clone().slice_unchecked(offset, length + 1); + Self { + data_type: self.data_type.clone(), + offsets, + values: self.values.clone(), + validity, + } + } + + /// Boxes self into a [`Box`]. + pub fn boxed(self) -> Box { + Box::new(self) + } + + /// Boxes self into a [`std::sync::Arc`]. + pub fn arced(self) -> std::sync::Arc { + std::sync::Arc::new(self) + } + + /// Clones this [`BinaryArray`] with a different validity. + /// # Panic + /// Panics iff `validity.len() != self.len()`. + #[must_use] + pub fn with_validity(&self, validity: Option) -> Self { + if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { + panic!("validity's length must be equal to the array's length") + } + let mut arr = self.clone(); + arr.validity = validity; + arr } /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero. @@ -131,19 +262,6 @@ impl BinaryArray { } } - /// Boxes self into a [`Box`]. - pub fn boxed(self) -> Box { - Box::new(self) - } - - /// Boxes self into a [`std::sync::Arc`]. - pub fn arced(self) -> std::sync::Arc { - std::sync::Arc::new(self) - } -} - -// unsafe constructors -impl BinaryArray { /// Creates a new [`BinaryArray`] without checking for offsets monotinicity. /// /// # Errors @@ -161,7 +279,7 @@ impl BinaryArray { offsets: Buffer, values: Buffer, validity: Option, - ) -> Result { + ) -> Result { try_check_offsets_bounds(&offsets, values.len())?; if validity @@ -187,138 +305,115 @@ impl BinaryArray { }) } - /// Creates a new [`BinaryArray`] without checking for offsets monotinicity. - /// - /// # Panics - /// This function returns an error iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`. - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// # Implementation - /// This function is `O(1)` - pub unsafe fn new_unchecked( + /// Alias for unwrapping [`Self::try_new`] + pub fn new( data_type: DataType, offsets: Buffer, values: Buffer, validity: Option, ) -> Self { - Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() + Self::try_new(data_type, offsets, values, validity).unwrap() } - /// Alias for [`Self::new_unchecked`] + /// Alias for unwrapping [`Self::try_new_unchecked`] /// # Safety /// This function is unsafe iff: /// * the offsets are not monotonically increasing - pub unsafe fn from_data_unchecked( + pub unsafe fn new_unchecked( data_type: DataType, offsets: Buffer, values: Buffer, validity: Option, ) -> Self { - Self::new_unchecked(data_type, offsets, values, validity) - } -} - -// must use -impl BinaryArray { - /// Creates a new [`BinaryArray`] by slicing this [`BinaryArray`]. - /// # Implementation - /// This function is `O(1)`: all data will be shared between both arrays. - /// # Panics - /// iff `offset + length > self.len()`. - #[must_use] - pub fn slice(&self, offset: usize, length: usize) -> Self { - assert!( - offset + length <= self.len(), - "the offset of the new Buffer cannot exceed the existing length" - ); - unsafe { self.slice_unchecked(offset, length) } + Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() } - /// Creates a new [`BinaryArray`] by slicing this [`BinaryArray`]. - /// # Implementation - /// This function is `O(1)`: all data will be shared between both arrays. - /// # Safety - /// The caller must ensure that `offset + length <= self.len()`. - #[must_use] - pub unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Self { - let validity = self - .validity - .clone() - .map(|x| x.slice_unchecked(offset, length)); - let offsets = self.offsets.clone().slice_unchecked(offset, length + 1); - Self { - data_type: self.data_type.clone(), - offsets, - values: self.values.clone(), - validity, - } + /// Returns a [`BinaryArray`] from an iterator of trusted length. + /// + /// The [`BinaryArray`] is guaranteed to not have a validity + #[inline] + pub fn from_trusted_len_values_iter, I: TrustedLen>( + iterator: I, + ) -> Self { + MutableBinaryArray::::from_trusted_len_values_iter(iterator).into() } - /// Clones this [`BinaryArray`] with a different validity. - /// # Panic - /// Panics iff `validity.len() != self.len()`. - #[must_use] - pub fn with_validity(&self, validity: Option) -> Self { - if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { - panic!("validity's length must be equal to the array's length") - } - let mut arr = self.clone(); - arr.validity = validity; - arr + /// Returns a new [`BinaryArray`] from a [`Iterator`] of `&[u8]`. + /// + /// The [`BinaryArray`] is guaranteed to not have a validity + pub fn from_iter_values, I: Iterator>(iterator: I) -> Self { + MutableBinaryArray::::from_iter_values(iterator).into() } -} -// accessors -impl BinaryArray { - /// Returns the length of this array + /// Creates a [`BinaryArray`] from an iterator of trusted length. + /// # Safety + /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). + /// I.e. that `size_hint().1` correctly reports its length. #[inline] - pub fn len(&self) -> usize { - self.offsets.len() - 1 + pub unsafe fn from_trusted_len_iter_unchecked(iterator: I) -> Self + where + P: AsRef<[u8]>, + I: Iterator>, + { + MutableBinaryArray::::from_trusted_len_iter_unchecked(iterator).into() } - /// Returns the element at index `i` - /// # Panics - /// iff `i >= self.len()` - pub fn value(&self, i: usize) -> &[u8] { - let start = self.offsets[i].to_usize(); - let end = self.offsets[i + 1].to_usize(); - - // soundness: the invariant of the struct - unsafe { self.values.get_unchecked(start..end) } + /// Creates a [`BinaryArray`] from a [`TrustedLen`] + #[inline] + pub fn from_trusted_len_iter(iterator: I) -> Self + where + P: AsRef<[u8]>, + I: TrustedLen>, + { + // soundness: I is `TrustedLen` + unsafe { Self::from_trusted_len_iter_unchecked(iterator) } } - /// Returns the element at index `i` + /// Creates a [`BinaryArray`] from an falible iterator of trusted length. /// # Safety - /// Assumes that the `i < self.len`. - pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { - // soundness: the invariant of the function - let start = self.offsets.get_unchecked(i).to_usize(); - let end = self.offsets.get_unchecked(i + 1).to_usize(); - - // soundness: the invariant of the struct - self.values.get_unchecked(start..end) + /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). + /// I.e. that `size_hint().1` correctly reports its length. + #[inline] + pub unsafe fn try_from_trusted_len_iter_unchecked(iterator: I) -> Result + where + P: AsRef<[u8]>, + I: IntoIterator, E>>, + { + MutableBinaryArray::::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into()) } - /// The optional validity. + /// Creates a [`BinaryArray`] from an fallible iterator of trusted length. #[inline] - pub fn validity(&self) -> Option<&Bitmap> { - self.validity.as_ref() + pub fn try_from_trusted_len_iter(iter: I) -> Result + where + P: AsRef<[u8]>, + I: TrustedLen, E>>, + { + // soundness: I: TrustedLen + unsafe { Self::try_from_trusted_len_iter_unchecked(iter) } } - /// Returns the offsets that slice `.values()` to return valid values. - #[inline] - pub fn offsets(&self) -> &Buffer { - &self.offsets + /// Alias for [`Self::new_unchecked`] + /// # Safety + /// This function is unsafe iff: + /// * the offsets are not monotonically increasing + pub unsafe fn from_data_unchecked( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::new_unchecked(data_type, offsets, values, validity) } - /// Returns all values in this array. Use `.offsets()` to slice them. - #[inline] - pub fn values(&self) -> &Buffer { - &self.values + /// Alias for `new` + pub fn from_data( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::new(data_type, offsets, values, validity) } } diff --git a/src/array/equal/binary.rs b/src/array/equal/binary.rs index 036f0693495..1c86fab6dce 100644 --- a/src/array/equal/binary.rs +++ b/src/array/equal/binary.rs @@ -1,4 +1,4 @@ -use crate::array::{Array, BinaryArray, Offset}; +use crate::array::{BinaryArray, Offset}; pub(super) fn equal(lhs: &BinaryArray, rhs: &BinaryArray) -> bool { lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter()) diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 088e77076f6..5d89fe4aabb 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -42,6 +42,13 @@ pub use mutable::*; /// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 2 + 5])); /// # } /// ``` +/// +/// # Generic parameter +/// The generic parameter [`Offset`] can only be `i32` or `i64` and tradeoffs maximum array length with +/// memory usage: +/// * the sum of lengths of all elements cannot exceed `Offset::MAX` +/// * the total size of the underlying data is `array.len() * size_of::() + sum of lengths of all elements` +/// /// # Safety /// The following invariants hold: /// * Two consecutives `offsets` casted (`as`) to `usize` are valid slices of `values`. @@ -57,7 +64,7 @@ pub struct Utf8Array { // constructors impl Utf8Array { - /// Returns a [`Utf8Array`] from its internal representation. + /// Returns a [`Utf8Array`] created from its internal representation. /// /// # Errors /// This function returns an error iff: @@ -115,10 +122,7 @@ impl Utf8Array { /// Returns an iterator of `Option<&str>` pub fn iter(&self) -> ZipValidity<&str, Utf8ValuesIter> { - zip_validity( - Utf8ValuesIter::new(self), - self.validity.as_ref().map(|x| x.iter()), - ) + zip_validity(self.values_iter(), self.validity.as_ref().map(|x| x.iter())) } /// Returns an iterator of `&str`