diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index 2612a1d26e1..ba7f635de9f 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -198,7 +198,7 @@ impl BinaryArray { Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() } - /// Alias for [`new_unchecked`] + /// Alias for [`Self::new_unchecked`] /// # Safety /// This function is unsafe iff: /// * the offsets are not monotonically increasing diff --git a/src/array/equal/utf8.rs b/src/array/equal/utf8.rs index 337eb043f81..3a8f0e5f012 100644 --- a/src/array/equal/utf8.rs +++ b/src/array/equal/utf8.rs @@ -1,4 +1,4 @@ -use crate::array::{Array, Offset, Utf8Array}; +use crate::array::{Offset, Utf8Array}; pub(super) fn equal(lhs: &Utf8Array, rhs: &Utf8Array) -> bool { lhs.data_type() == rhs.data_type() && lhs.len() == rhs.len() && lhs.iter().eq(rhs.iter()) diff --git a/src/array/fixed_size_binary/mod.rs b/src/array/fixed_size_binary/mod.rs index a18f3968c35..2a843842dc0 100644 --- a/src/array/fixed_size_binary/mod.rs +++ b/src/array/fixed_size_binary/mod.rs @@ -186,7 +186,7 @@ impl FixedSizeBinaryArray { .get_unchecked(i * self.size..(i + 1) * self.size) } - /// Returns a new [`FixedSizeBinary`] with a different logical type. + /// Returns a new [`FixedSizeBinaryArray`] with a different logical type. /// This is `O(1)`. /// # Panics /// Panics iff the data_type is not supported for the physical type. 
diff --git a/src/array/primitive/from_natural.rs b/src/array/primitive/from_natural.rs index 3d259f90604..5f131dcab1e 100644 --- a/src/array/primitive/from_natural.rs +++ b/src/array/primitive/from_natural.rs @@ -1,6 +1,6 @@ use std::iter::FromIterator; -use crate::{trusted_len::TrustedLen, types::NativeType}; +use crate::types::NativeType; use super::{MutablePrimitiveArray, PrimitiveArray}; @@ -15,59 +15,3 @@ impl>> FromIterator for P MutablePrimitiveArray::::from_iter(iter).into() } } - -impl PrimitiveArray { - /// Creates a (non-null) [`PrimitiveArray`] from an iterator of values. - /// # Implementation - /// This does not assume that the iterator has a known length. - pub fn from_values>(iter: I) -> Self { - Self::new(T::PRIMITIVE.into(), Vec::::from_iter(iter).into(), None) - } - - /// Creates a (non-null) [`PrimitiveArray`] from a slice of values. - /// # Implementation - /// This is essentially a memcopy - pub fn from_slice>(slice: P) -> Self { - Self::new( - T::PRIMITIVE.into(), - Vec::::from(slice.as_ref()).into(), - None, - ) - } - - /// Creates a (non-null) [`PrimitiveArray`] from a vector of values. - /// This does not have memcopy and is the fastest way to create a [`PrimitiveArray`]. - pub fn from_vec(array: Vec) -> Self { - Self::new(T::PRIMITIVE.into(), array.into(), None) - } -} - -impl PrimitiveArray { - /// Creates a (non-null) [`PrimitiveArray`] from a [`TrustedLen`] of values. - /// # Implementation - /// This does not assume that the iterator has a known length. - pub fn from_trusted_len_values_iter>(iter: I) -> Self { - MutablePrimitiveArray::::from_trusted_len_values_iter(iter).into() - } - - /// Creates a new [`PrimitiveArray`] from an iterator over values - /// # Safety - /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). - /// I.e. that `size_hint().1` correctly reports its length. 
- pub unsafe fn from_trusted_len_values_iter_unchecked>(iter: I) -> Self { - MutablePrimitiveArray::::from_trusted_len_values_iter_unchecked(iter).into() - } - - /// Creates a [`PrimitiveArray`] from a [`TrustedLen`] of optional values. - pub fn from_trusted_len_iter>>(iter: I) -> Self { - MutablePrimitiveArray::::from_trusted_len_iter(iter).into() - } - - /// Creates a [`PrimitiveArray`] from an iterator of optional values. - /// # Safety - /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). - /// I.e. that `size_hint().1` correctly reports its length. - pub unsafe fn from_trusted_len_iter_unchecked>>(iter: I) -> Self { - MutablePrimitiveArray::::from_trusted_len_iter_unchecked(iter).into() - } -} diff --git a/src/array/primitive/iterator.rs b/src/array/primitive/iterator.rs index 5d5049e34fc..42fb758c4da 100644 --- a/src/array/primitive/iterator.rs +++ b/src/array/primitive/iterator.rs @@ -16,17 +16,6 @@ impl<'a, T: NativeType> IntoIterator for &'a PrimitiveArray { } } -impl<'a, T: NativeType> PrimitiveArray { - /// constructs a new iterator - #[inline] - pub fn iter(&'a self) -> ZipValidity<'a, &'a T, std::slice::Iter<'a, T>> { - zip_validity( - self.values().iter(), - self.validity.as_ref().map(|x| x.iter()), - ) - } -} - impl<'a, T: NativeType> MutablePrimitiveArray { /// Returns an iterator over `Option` #[inline] diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index d8ec4182b67..ae753549f75 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -1,8 +1,12 @@ use crate::{ - bitmap::Bitmap, + bitmap::{ + utils::{zip_validity, ZipValidity}, + Bitmap, + }, buffer::Buffer, datatypes::*, error::Error, + trusted_len::TrustedLen, types::{days_ms, months_days_ns, NativeType}, }; @@ -17,19 +21,30 @@ pub use iterator::*; mod mutable; pub use mutable::*; -/// A [`PrimitiveArray`] is arrow's equivalent to `Vec>`, i.e. 
-/// an array designed for highly performant operations on optionally nullable slots, -/// backed by a physical type of a fixed byte-width, such as `i32` or `f64`. -/// The size of this struct is `O(1)` as all data is stored behind an [`std::sync::Arc`]. +/// A [`PrimitiveArray`] is Arrow's semantically equivalent of an immutable `Vec>` where +/// T is [`NativeType`] (e.g. [`i32`]). It implements [`Array`]. +/// +/// One way to think about a [`PrimitiveArray`] is `(DataType, Arc>, Option>>)` +/// where: +/// * the first item is the array's logical type +/// * the second is the immutable values +/// * the third is the immutable validity (whether a value is null or not as a bitmap). +/// +/// The size of this struct is `O(1)`, as all data is stored behind an [`std::sync::Arc`]. /// # Example /// ``` -/// use arrow2::array::{PrimitiveArray, Array}; +/// use arrow2::array::PrimitiveArray; /// use arrow2::bitmap::Bitmap; -/// # fn main() { -/// let array = PrimitiveArray::from([Some(1), None, Some(10)]); -/// assert_eq!(array.values().as_slice(), &[1, 0, 10]); +/// use arrow2::buffer::Buffer; +/// +/// let array = PrimitiveArray::from([Some(1i32), None, Some(10)]); +/// assert_eq!(array.value(0), 1); +/// assert_eq!(array.iter().collect::>(), vec![Some(&1i32), None, Some(&10)]); +/// assert_eq!(array.values_iter().copied().collect::>(), vec![1, 0, 10]); +/// // the underlying representation +/// assert_eq!(array.values(), &Buffer::from(vec![1i32, 0, 10])); /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); -/// # } +/// /// ``` #[derive(Clone)] pub struct PrimitiveArray { @@ -39,11 +54,14 @@ pub struct PrimitiveArray { } impl PrimitiveArray { - /// The canonical method to create a [`PrimitiveArray`]. + /// The canonical method to create a [`PrimitiveArray`] out of its internal components. + /// # Implementation + /// This function is `O(1)`. 
+ /// /// # Errors /// This function errors iff: /// * The validity is not `None` and its length is different from `values`'s length - /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive`]. + /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive(T::PRIMITIVE)`] pub fn try_new( data_type: DataType, values: Buffer, @@ -71,42 +89,127 @@ impl PrimitiveArray { }) } - /// The canonical method to create a [`PrimitiveArray`] + /// Returns a new [`PrimitiveArray`] with a different logical type. + /// + /// This function is useful to assign a different [`DataType`] to the array. + /// Used to change the arrays' logical type (see example). + /// # Example + /// ``` + /// use arrow2::array::Int32Array; + /// use arrow2::datatypes::DataType; + /// + /// let array = Int32Array::from(&[Some(1), None, Some(2)]).to(DataType::Date32); + /// assert_eq!( + /// format!("{:?}", array), + /// "Date32[1970-01-02, None, 1970-01-03]" + /// ); + /// ``` /// # Panics - /// This function errors iff: - /// * The validity is not `None` and its length is different from `values`'s length - /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive`]. 
- pub fn new(data_type: DataType, values: Buffer, validity: Option) -> Self { - Self::try_new(data_type, values, validity).unwrap() + /// Panics iff the `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive(T::PRIMITIVE)`] + #[inline] + #[must_use] + pub fn to(self, data_type: DataType) -> Self { + if !data_type.to_physical_type().eq_primitive(T::PRIMITIVE) { + Err(Error::InvalidArgumentError(format!( + "Type {} does not support logical type {:?}", + std::any::type_name::(), + data_type + ))) + .unwrap() + } + Self { + data_type, + values: self.values, + validity: self.validity, + } } - /// Alias for `new` - pub fn from_data(data_type: DataType, values: Buffer, validity: Option) -> Self { - Self::new(data_type, values, validity) + /// Creates a (non-null) [`PrimitiveArray`] from a vector of values. + /// # Examples + /// ``` + /// use arrow2::array::PrimitiveArray; + /// + /// let array = PrimitiveArray::from_vec(vec![1, 2, 3]); + /// assert_eq!(format!("{:?}", array), "Int32[1, 2, 3]"); + /// ``` + pub fn from_vec(values: Vec) -> Self { + Self::new(T::PRIMITIVE.into(), values.into(), None) } - /// Returns a new empty [`PrimitiveArray`]. - pub fn new_empty(data_type: DataType) -> Self { - Self::new(data_type, Buffer::new(), None) + /// Returns an iterator over the values and validity, `Option<&T>`. + #[inline] + pub fn iter(&self) -> ZipValidity<&T, std::slice::Iter> { + zip_validity( + self.values().iter(), + self.validity().as_ref().map(|x| x.iter()), + ) } - /// Returns a new [`PrimitiveArray`] whose all slots are null / `None`. + /// Returns an iterator of the values, `&T`, ignoring the arrays' validity. #[inline] - pub fn new_null(data_type: DataType, length: usize) -> Self { - Self::new( - data_type, - Buffer::new_zeroed(length), - Some(Bitmap::new_zeroed(length)), - ) + pub fn values_iter(&self) -> std::slice::Iter { + self.values().iter() } -} -impl PrimitiveArray { - /// Returns a slice of this [`PrimitiveArray`]. 
+ /// Returns the length of this array + #[inline] + pub fn len(&self) -> usize { + self.values.len() + } + + /// The values [`Buffer`]. + /// Values on null slots are undetermined (they can be anything). + #[inline] + pub fn values(&self) -> &Buffer { + &self.values + } + + /// Returns the optional validity. + #[inline] + pub fn validity(&self) -> Option<&Bitmap> { + self.validity.as_ref() + } + + /// Returns the arrays' [`DataType`]. + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + /// Returns the value at slot `i`. + /// + /// Equivalent to `self.values()[i]`. The value of a null slot is undetermined (it can be anything). + /// # Panic + /// This function panics iff `i >= self.len`. + #[inline] + pub fn value(&self, i: usize) -> T { + self.values()[i] + } + + /// Returns the value at index `i`. + /// The value on null slots is undetermined (it can be anything). + /// # Safety + /// Caller must be sure that `i < self.len()` + #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> T { + *self.values.get_unchecked(i) + } + + /// Returns a clone of this [`PrimitiveArray`] sliced by an offset and length. /// # Implementation /// This operation is `O(1)` as it amounts to increase two ref counts. + /// # Examples + /// ``` + /// use arrow2::array::PrimitiveArray; + /// + /// let array = PrimitiveArray::from_vec(vec![1, 2, 3]); + /// assert_eq!(format!("{:?}", array), "Int32[1, 2, 3]"); + /// let sliced = array.slice(1, 1); + /// assert_eq!(format!("{:?}", sliced), "Int32[2]"); + /// // note: `sliced` and `array` share the same memory region. + /// ``` /// # Panic - /// This function panics iff `offset + length >= self.len()`. + /// This function panics iff `offset + length > self.len()`. #[inline] #[must_use] pub fn slice(&self, offset: usize, length: usize) -> Self { @@ -117,7 +220,7 @@ impl PrimitiveArray { unsafe { self.slice_unchecked(offset, length) } } - /// Returns a slice of this [`PrimitiveArray`]. 
+ /// Returns a clone of this [`PrimitiveArray`] sliced by an offset and length. /// # Implementation /// This operation is `O(1)` as it amounts to increase two ref counts. /// # Safety @@ -136,7 +239,7 @@ impl PrimitiveArray { } } - /// Sets the validity bitmap on this [`PrimitiveArray`]. + /// Returns a clone of this [`PrimitiveArray`] with a new validity. /// # Panics /// This function panics iff `validity.len() != self.len()`. #[must_use] @@ -148,71 +251,15 @@ impl PrimitiveArray { arr.validity = validity; arr } -} - -impl PrimitiveArray { - /// Returns the length of this array - #[inline] - pub fn len(&self) -> usize { - self.values.len() - } - /// The optional validity. - #[inline] - pub fn validity(&self) -> Option<&Bitmap> { - self.validity.as_ref() - } - - /// The arrays' logical type - #[inline] - pub fn data_type(&self) -> &DataType { - &self.data_type - } - - /// The values [`Buffer`]. - /// Values on null slots are undetermined (they can be anything). - #[inline] - pub fn values(&self) -> &Buffer { - &self.values - } - - /// Returns the value at slot `i`. Equivalent to `self.values()[i]`. - /// The value on null slots is undetermined (it can be anything). - #[inline] - pub fn value(&self, i: usize) -> T { - self.values()[i] - } - - /// Returns the element at index `i` as `T`. - /// The value on null slots is undetermined (it can be anything). - /// # Safety - /// Caller must be sure that `i < self.len()` - #[inline] - pub unsafe fn value_unchecked(&self, i: usize) -> T { - *self.values.get_unchecked(i) - } - - /// Returns a new [`PrimitiveArray`] with a different logical type. - /// This is `O(1)`. - /// # Panics - /// Panics iff the data_type is not supported for the physical type. 
- #[inline] - pub fn to(self, data_type: DataType) -> Self { - if !data_type.to_physical_type().eq_primitive(T::PRIMITIVE) { - Err(Error::InvalidArgumentError(format!( - "Type {} does not support logical type {:?}", - std::any::type_name::(), - data_type - ))) - .unwrap() - } - Self { - data_type, - values: self.values, - validity: self.validity, - } - } - /// Try to convert this `PrimitiveArray` to a `MutablePrimitiveArray` + /// Try to convert this [`PrimitiveArray`] to a [`MutablePrimitiveArray`] via copy-on-write semantics. + /// + /// A [`PrimitiveArray`] is backed by a [`Buffer`] and [`Bitmap`] which are essentially `Arc>`. + /// This function returns a [`MutablePrimitiveArray`] (via [`std::sync::Arc::get_mut`]) iff both values + /// and validity have not been cloned / are unique references to their underlying vectors. + /// + /// This function is primarily used to re-use memory regions. + #[must_use] pub fn into_mut(self) -> Either> { use Either::*; @@ -247,6 +294,81 @@ impl PrimitiveArray { } } } + + /// Returns a new empty (zero-length) [`PrimitiveArray`]. + pub fn new_empty(data_type: DataType) -> Self { + Self::new(data_type, Buffer::new(), None) + } + + /// Returns a new [`PrimitiveArray`] where all slots are null / `None`. + #[inline] + pub fn new_null(data_type: DataType, length: usize) -> Self { + Self::new( + data_type, + Buffer::new_zeroed(length), + Some(Bitmap::new_zeroed(length)), + ) + } + + /// Creates a (non-null) [`PrimitiveArray`] from an iterator of values. + /// # Implementation + /// This does not assume that the iterator has a known length. + pub fn from_values>(iter: I) -> Self { + Self::new(T::PRIMITIVE.into(), Vec::::from_iter(iter).into(), None) + } + + /// Creates a (non-null) [`PrimitiveArray`] from a slice of values. 
+ /// # Implementation + /// This is essentially a memcopy and is thus `O(N)` + pub fn from_slice>(slice: P) -> Self { + Self::new( + T::PRIMITIVE.into(), + Vec::::from(slice.as_ref()).into(), + None, + ) + } + + /// Creates a (non-null) [`PrimitiveArray`] from a [`TrustedLen`] of values. + /// # Implementation + /// This relies on the iterator being [`TrustedLen`], i.e. on it reporting its exact length. + pub fn from_trusted_len_values_iter>(iter: I) -> Self { + MutablePrimitiveArray::::from_trusted_len_values_iter(iter).into() + } + + /// Creates a new [`PrimitiveArray`] from an iterator over values + /// # Safety + /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). + /// I.e. that `size_hint().1` correctly reports its length. + pub unsafe fn from_trusted_len_values_iter_unchecked>(iter: I) -> Self { + MutablePrimitiveArray::::from_trusted_len_values_iter_unchecked(iter).into() + } + + /// Creates a [`PrimitiveArray`] from a [`TrustedLen`] of optional values. + pub fn from_trusted_len_iter>>(iter: I) -> Self { + MutablePrimitiveArray::::from_trusted_len_iter(iter).into() + } + + /// Creates a [`PrimitiveArray`] from an iterator of optional values. + /// # Safety + /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). + /// I.e. that `size_hint().1` correctly reports its length. + pub unsafe fn from_trusted_len_iter_unchecked>>(iter: I) -> Self { + MutablePrimitiveArray::::from_trusted_len_iter_unchecked(iter).into() + } + + /// Alias for `Self::try_new(..).unwrap()`. + /// # Panics + /// This function panics iff: + /// * The validity is not `None` and its length is different from `values`'s length + /// * The `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive`]. + pub fn new(data_type: DataType, values: Buffer, validity: Option) -> Self { + Self::try_new(data_type, values, validity).unwrap() + } + + /// Alias for `Self::try_new(..).unwrap()`. 
+ pub fn from_data(data_type: DataType, values: Buffer, validity: Option) -> Self { + Self::new(data_type, values, validity) + } } impl Array for PrimitiveArray { diff --git a/src/array/utf8/from.rs b/src/array/utf8/from.rs index 9463f7ee365..1a0a0a1f7e2 100644 --- a/src/array/utf8/from.rs +++ b/src/array/utf8/from.rs @@ -1,88 +1,9 @@ use std::iter::FromIterator; use crate::array::Offset; -use crate::trusted_len::TrustedLen; use super::{MutableUtf8Array, Utf8Array}; -impl Utf8Array { - /// Creates a new [`Utf8Array`] from a slice of `&str`. - /// This is a convenience method that just calls [`Self::from_trusted_len_values_iter`]. - #[inline] - pub fn from_slice, P: AsRef<[T]>>(slice: P) -> Self { - Self::from_trusted_len_values_iter(slice.as_ref().iter()) - } - - /// Creates a new [`Utf8Array`] from a slice of `&str`. - // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it. - pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { - Self::from_trusted_len_iter(slice.as_ref().iter().map(|x| x.as_ref())) - } - - /// Creates a new [`Utf8Array`] from a [`TrustedLen`] of `&str`. - #[inline] - pub fn from_trusted_len_values_iter, I: TrustedLen>( - iterator: I, - ) -> Self { - MutableUtf8Array::::from_trusted_len_values_iter(iterator).into() - } - - /// Creates a new [`Utf8Array`] from a [`Iterator`] of `&str`. - pub fn from_iter_values, I: Iterator>(iterator: I) -> Self { - MutableUtf8Array::::from_iter_values(iterator).into() - } -} - -impl Utf8Array { - /// Creates a [`Utf8Array`] from an iterator of trusted length. - /// # Safety - /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). - /// I.e. that `size_hint().1` correctly reports its length. 
- #[inline] - pub unsafe fn from_trusted_len_iter_unchecked(iterator: I) -> Self - where - P: AsRef, - I: Iterator>, - { - MutableUtf8Array::::from_trusted_len_iter_unchecked(iterator).into() - } - - /// Creates a [`Utf8Array`] from an iterator of trusted length. - #[inline] - pub fn from_trusted_len_iter(iterator: I) -> Self - where - P: AsRef, - I: TrustedLen>, - { - // soundness: I is `TrustedLen` - unsafe { Self::from_trusted_len_iter_unchecked(iterator) } - } - - /// Creates a [`Utf8Array`] from an falible iterator of trusted length. - /// # Safety - /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). - /// I.e. that `size_hint().1` correctly reports its length. - #[inline] - pub unsafe fn try_from_trusted_len_iter_unchecked(iterator: I) -> Result - where - P: AsRef, - I: IntoIterator, E>>, - { - MutableUtf8Array::::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into()) - } - - /// Creates a [`Utf8Array`] from an fallible iterator of trusted length. 
- #[inline] - pub fn try_from_trusted_len_iter(iter: I) -> Result - where - P: AsRef, - I: TrustedLen, E>>, - { - // soundness: I: TrustedLen - unsafe { Self::try_from_trusted_len_iter_unchecked(iter) } - } -} - impl> FromIterator> for Utf8Array { #[inline] fn from_iter>>(iter: I) -> Self { diff --git a/src/array/utf8/iterator.rs b/src/array/utf8/iterator.rs index 2d403b3f4bb..6a725c6254b 100644 --- a/src/array/utf8/iterator.rs +++ b/src/array/utf8/iterator.rs @@ -1,4 +1,4 @@ -use crate::bitmap::utils::{zip_validity, ZipValidity}; +use crate::bitmap::utils::ZipValidity; use crate::{array::Offset, trusted_len::TrustedLen}; use super::Utf8Array; @@ -74,19 +74,4 @@ impl<'a, O: Offset> IntoIterator for &'a Utf8Array { } } -impl<'a, O: Offset> Utf8Array { - /// Returns an iterator of `Option<&str>` - pub fn iter(&'a self) -> ZipValidity<'a, &'a str, Utf8ValuesIter<'a, O>> { - zip_validity( - Utf8ValuesIter::new(self), - self.validity.as_ref().map(|x| x.iter()), - ) - } - - /// Returns an iterator of `&str` - pub fn values_iter(&'a self) -> Utf8ValuesIter<'a, O> { - Utf8ValuesIter::new(self) - } -} - unsafe impl TrustedLen for Utf8ValuesIter<'_, O> {} diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index ed032e089fb..c216ae0ab59 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -1,9 +1,14 @@ use crate::{ - bitmap::Bitmap, + bitmap::{ + utils::{zip_validity, ZipValidity}, + Bitmap, + }, buffer::Buffer, datatypes::DataType, error::{Error, Result}, + trusted_len::TrustedLen, }; + use either::Either; use super::{ @@ -19,16 +24,22 @@ mod mutable; pub use iterator::*; pub use mutable::*; -/// A [`Utf8Array`] is arrow's equivalent of an immutable `Vec>`. +/// A [`Utf8Array`] is arrow's semantic equivalent of an immutable `Vec>`. /// Cloning and slicing this struct is `O(1)`. 
/// # Example /// ``` +/// use arrow2::bitmap::Bitmap; +/// use arrow2::buffer::Buffer; /// use arrow2::array::Utf8Array; /// # fn main() { /// let array = Utf8Array::::from([Some("hi"), None, Some("there")]); /// assert_eq!(array.value(0), "hi"); -/// assert_eq!(array.values().as_slice(), b"hithere".as_ref()); -/// assert_eq!(array.offsets().as_slice(), &[0, 2, 2, 2 + 5]); +/// assert_eq!(array.iter().collect::>(), vec![Some("hi"), None, Some("there")]); +/// assert_eq!(array.values_iter().collect::>(), vec!["hi", "", "there"]); +/// // the underlying representation +/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); +/// assert_eq!(array.values(), &Buffer::from(b"hithere".to_vec())); +/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 2 + 5])); /// # } /// ``` /// # Safety @@ -46,7 +57,7 @@ pub struct Utf8Array { // constructors impl Utf8Array { - /// Returns a new [`Utf8Array`]. + /// Returns a [`Utf8Array`] from its internal representation. /// /// # Errors /// This function returns an error iff: @@ -87,154 +98,89 @@ impl Utf8Array { }) } - /// Creates a new [`Utf8Array`]. - /// # Panics - /// This function panics iff: - /// * the offsets are not monotonically increasing - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. - /// * The `values` between two consecutive `offsets` are not valid utf8 - /// # Implementation - /// This function is `O(N)` - checking monotinicity and utf8 is `O(N)` - pub fn new( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::try_new(data_type, offsets, values, validity).unwrap() + /// Returns a [`Utf8Array`] from a slice of `&str`. + /// + /// A convenience method that uses [`Self::from_trusted_len_values_iter`]. 
+ pub fn from_slice, P: AsRef<[T]>>(slice: P) -> Self { + Self::from_trusted_len_values_iter(slice.as_ref().iter()) } - /// Alias for `new` - pub fn from_data( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::new(data_type, offsets, values, validity) + /// Returns a new [`Utf8Array`] from a slice of `&str`. + /// + /// A convenience method that uses [`Self::from_trusted_len_iter`]. + // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it. + pub fn from, P: AsRef<[Option]>>(slice: P) -> Self { + Self::from_trusted_len_iter(slice.as_ref().iter().map(|x| x.as_ref())) } - /// Returns a new empty [`Utf8Array`]. - #[inline] - pub fn new_empty(data_type: DataType) -> Self { - unsafe { - Self::from_data_unchecked( - data_type, - Buffer::from(vec![O::zero()]), - Buffer::new(), - None, - ) - } + /// Returns an iterator of `Option<&str>` + pub fn iter(&self) -> ZipValidity<&str, Utf8ValuesIter> { + zip_validity( + Utf8ValuesIter::new(self), + self.validity.as_ref().map(|x| x.iter()), + ) } - /// Returns a new [`Utf8Array`] whose all slots are null / `None`. + /// Returns an iterator of `&str` + pub fn values_iter(&self) -> Utf8ValuesIter { + Utf8ValuesIter::new(self) + } + + /// Returns the length of this array #[inline] - pub fn new_null(data_type: DataType, length: usize) -> Self { - Self::new( - data_type, - Buffer::new_zeroed(length + 1), - Buffer::new(), - Some(Bitmap::new_zeroed(length)), - ) + pub fn len(&self) -> usize { + self.offsets.len() - 1 } - /// Returns the default [`DataType`], `DataType::Utf8` or `DataType::LargeUtf8` - pub fn default_data_type() -> DataType { - if O::IS_LARGE { - DataType::LargeUtf8 - } else { - DataType::Utf8 - } + /// Returns the value of the element at index `i`, ignoring the array's validity. + /// # Panic + /// This function panics iff `i >= self.len`. 
+ #[inline] + pub fn value(&self, i: usize) -> &str { + assert!(i < self.len()); + unsafe { self.value_unchecked(i) } } -} -// unsafe constructors -impl Utf8Array { - /// Creates a new [`Utf8Array`] without checking for offsets monotinicity nor utf8-validity - /// - /// # Errors - /// This function returns an error iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. + /// Returns the value of the element at index `i`, ignoring the array's validity. /// # Safety - /// This function is unsound iff: - /// * the offsets are not monotonically increasing - /// * The `values` between two consecutive `offsets` are not valid utf8 - /// # Implementation - /// This function is `O(1)` - pub unsafe fn try_new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Result { - try_check_offsets_bounds(&offsets, values.len())?; + /// This function is safe iff `i < self.len`. 
+ #[inline] + pub unsafe fn value_unchecked(&self, i: usize) -> &str { + // soundness: the invariant of the function + let start = self.offsets.get_unchecked(i).to_usize(); + let end = self.offsets.get_unchecked(i + 1).to_usize(); - if validity - .as_ref() - .map_or(false, |validity| validity.len() != offsets.len() - 1) - { - return Err(Error::oos( - "validity mask length must match the number of values", - )); - } + // soundness: the invariant of the struct + let slice = self.values.get_unchecked(start..end); - if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { - return Err(Error::oos( - "BinaryArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8", - )); - } + // soundness: the invariant of the struct + std::str::from_utf8_unchecked(slice) + } - Ok(Self { - data_type, - offsets, - values, - validity, - }) + /// Returns the [`DataType`] of this array. + #[inline] + pub fn data_type(&self) -> &DataType { + &self.data_type } - /// Creates a new [`Utf8Array`] without checking for offsets monotinicity. - /// - /// # Errors - /// This function returns an error iff: - /// * The last offset is not equal to the values' length. - /// * the validity's length is not equal to `offsets.len() - 1`. - /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. - /// # Safety - /// This function is unsound iff: - /// * the offsets are not monotonically increasing - /// * The `values` between two consecutive `offsets` are not valid utf8 - /// # Implementation - /// This function is `O(1)` - pub unsafe fn new_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() + /// Returns the values of this [`Utf8Array`]. 
+ #[inline] + pub fn values(&self) -> &Buffer { + &self.values } - /// Alias for [`new_unchecked`] - /// # Safety - /// This function is unsafe iff: - /// * the offsets are not monotonically increasing - /// * The `values` between two consecutive `offsets` are not valid utf8 - pub unsafe fn from_data_unchecked( - data_type: DataType, - offsets: Buffer, - values: Buffer, - validity: Option, - ) -> Self { - Self::new_unchecked(data_type, offsets, values, validity) + /// Returns the offsets of this [`Utf8Array`]. + #[inline] + pub fn offsets(&self) -> &Buffer { + &self.offsets + } + + /// The optional validity. + #[inline] + pub fn validity(&self) -> Option<&Bitmap> { + self.validity.as_ref() } -} -// must use -impl Utf8Array { /// Returns a slice of this [`Utf8Array`]. /// # Implementation /// This operation is `O(1)` as it amounts to essentially increase two ref counts. @@ -248,6 +194,7 @@ impl Utf8Array { ); unsafe { self.slice_unchecked(offset, length) } } + /// Returns a slice of this [`Utf8Array`]. /// # Implementation /// This operation is `O(1)` as it amounts to essentially increase two ref counts. @@ -269,12 +216,12 @@ impl Utf8Array { } } - /// Sets the validity bitmap on this [`Utf8Array`]. + /// Clones this [`Utf8Array`] and assigns it a new validity /// # Panic /// This function panics iff `validity.len() != self.len()`. pub fn with_validity(&self, validity: Option) -> Self { if matches!(&validity, Some(bitmap) if bitmap.len() != self.len()) { - panic!("validity should be as least as large as the array") + panic!("validity's len must be equal to the array") } let mut arr = self.clone(); arr.validity = validity; @@ -376,59 +323,214 @@ impl Utf8Array { } } } -} -// Accessors -impl Utf8Array { - /// Returns the length of this array + /// Returns a new empty [`Utf8Array`]. + /// + /// The array is guaranteed to have no elements nor validity. 
+ #[inline] - pub fn len(&self) -> usize { - self.offsets.len() - 1 + pub fn new_empty(data_type: DataType) -> Self { + unsafe { + Self::from_data_unchecked( + data_type, + Buffer::from(vec![O::zero()]), + Buffer::new(), + None, + ) + } } - /// Returns the element at index `i` as &str + /// Returns a new [`Utf8Array`] whose slots are all null / `None`. + #[inline] + pub fn new_null(data_type: DataType, length: usize) -> Self { + Self::new( + data_type, + Buffer::new_zeroed(length + 1), + Buffer::new(), + Some(Bitmap::new_zeroed(length)), + ) + } + + /// Returns a default [`DataType`] of this array, which depends on the generic parameter `O`: `DataType::Utf8` or `DataType::LargeUtf8` + pub fn default_data_type() -> DataType { + if O::IS_LARGE { + DataType::LargeUtf8 + } else { + DataType::Utf8 + } + } + + /// Creates a new [`Utf8Array`] without checking for offsets monotonicity nor utf8-validity + /// + /// # Errors + /// This function returns an error iff: + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. + /// # Safety 
- pub unsafe fn value_unchecked(&self, i: usize) -> &str { - // soundness: the invariant of the function - let start = self.offsets.get_unchecked(i).to_usize(); - let end = self.offsets.get_unchecked(i + 1).to_usize(); + /// This function is unsound iff: + /// * the offsets are not monotonically increasing + /// * The `values` between two consecutive `offsets` are not valid utf8 + /// # Implementation + /// This function is `O(1)` + pub unsafe fn try_new_unchecked( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Result { + try_check_offsets_bounds(&offsets, values.len())?; - // soundness: the invariant of the struct - let slice = self.values.get_unchecked(start..end); + if validity + .as_ref() + .map_or(false, |validity| validity.len() != offsets.len() - 1) + { + return Err(Error::oos( + "validity mask length must match the number of values", + )); + } - // soundness: the invariant of the struct - std::str::from_utf8_unchecked(slice) + if data_type.to_physical_type() != Self::default_data_type().to_physical_type() { + return Err(Error::oos( + "BinaryArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8", + )); + } + + Ok(Self { + data_type, + offsets, + values, + validity, + }) } - /// Returns the element at index `i` - pub fn value(&self, i: usize) -> &str { - let start = self.offsets[i].to_usize(); - let end = self.offsets[i + 1].to_usize(); + /// Creates a new [`Utf8Array`]. + /// # Panics + /// This function panics iff: + /// * the offsets are not monotonically increasing + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. 
+ /// * The `values` between two consecutive `offsets` are not valid utf8 + /// # Implementation + /// This function is `O(N)` - checking monotonicity and utf8 is `O(N)` + pub fn new( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::try_new(data_type, offsets, values, validity).unwrap() + } - // soundness: the invariant of the struct - let slice = unsafe { self.values.get_unchecked(start..end) }; + /// Creates a new [`Utf8Array`] without checking for offsets monotonicity. + /// + /// # Errors + /// This function returns an error iff: + /// * The last offset is not equal to the values' length. + /// * the validity's length is not equal to `offsets.len() - 1`. + /// * The `data_type`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`. + /// # Safety + /// This function is unsound iff: + /// * the offsets are not monotonically increasing + /// * The `values` between two consecutive `offsets` are not valid utf8 + /// # Implementation + /// This function is `O(1)` + pub unsafe fn new_unchecked( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::try_new_unchecked(data_type, offsets, values, validity).unwrap() + } + + /// Returns a (non-null) [`Utf8Array`] created from a [`TrustedLen`] of `&str`. + /// # Implementation + /// This function is `O(N)` + #[inline] + pub fn from_trusted_len_values_iter, I: TrustedLen>( + iterator: I, + ) -> Self { + MutableUtf8Array::::from_trusted_len_values_iter(iterator).into() + } - // soundness: we always check for utf8 soundness on constructors. - unsafe { std::str::from_utf8_unchecked(slice) } + /// Creates a new [`Utf8Array`] from an [`Iterator`] of `&str`. + pub fn from_iter_values, I: Iterator>(iterator: I) -> Self { + MutableUtf8Array::::from_iter_values(iterator).into() } - /// The optional validity. + /// Creates a [`Utf8Array`] from an iterator of trusted length. 
+ /// # Safety + /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). + /// I.e. that `size_hint().1` correctly reports its length. #[inline] - pub fn validity(&self) -> Option<&Bitmap> { - self.validity.as_ref() + pub unsafe fn from_trusted_len_iter_unchecked(iterator: I) -> Self + where + P: AsRef, + I: Iterator>, + { + MutableUtf8Array::::from_trusted_len_iter_unchecked(iterator).into() } - /// Returns the offsets of this [`Utf8Array`]. + /// Creates a [`Utf8Array`] from an iterator of trusted length. #[inline] - pub fn offsets(&self) -> &Buffer { - &self.offsets + pub fn from_trusted_len_iter(iterator: I) -> Self + where + P: AsRef, + I: TrustedLen>, + { + // soundness: I is `TrustedLen` + unsafe { Self::from_trusted_len_iter_unchecked(iterator) } } - /// Returns the values of this [`Utf8Array`]. + /// Creates a [`Utf8Array`] from a fallible iterator of trusted length. + /// # Safety + /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). + /// I.e. that `size_hint().1` correctly reports its length. #[inline] - pub fn values(&self) -> &Buffer { - &self.values + pub unsafe fn try_from_trusted_len_iter_unchecked( + iterator: I, + ) -> std::result::Result + where + P: AsRef, + I: IntoIterator, E>>, + { + MutableUtf8Array::::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into()) + } + + /// Creates a [`Utf8Array`] from a fallible iterator of trusted length. 
+ #[inline] + pub fn try_from_trusted_len_iter(iter: I) -> std::result::Result + where + P: AsRef, + I: TrustedLen, E>>, + { + // soundness: I: TrustedLen + unsafe { Self::try_from_trusted_len_iter_unchecked(iter) } + } + + /// Alias for `new` + pub fn from_data( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::new(data_type, offsets, values, validity) + } + + /// Alias for [`Self::new_unchecked`] + /// # Safety + /// This function is unsafe iff: + /// * the offsets are not monotonically increasing + /// * The `values` between two consecutive `offsets` are not valid utf8 + pub unsafe fn from_data_unchecked( + data_type: DataType, + offsets: Buffer, + values: Buffer, + validity: Option, + ) -> Self { + Self::new_unchecked(data_type, offsets, values, validity) } } diff --git a/src/io/ipc/write/file_async.rs b/src/io/ipc/write/file_async.rs index 5d7777df001..fdde1e33fa9 100644 --- a/src/io/ipc/write/file_async.rs +++ b/src/io/ipc/write/file_async.rs @@ -15,7 +15,7 @@ use crate::io::ipc::{IpcField, ARROW_MAGIC}; type WriteOutput = (usize, Option, Vec, Option); -/// Sink that writes array [`chunks`](Chunk) as an IPC file. +/// Sink that writes array [`chunks`](crate::chunk::Chunk) as an IPC file. /// /// The file header is automatically written before writing the first chunk, and the file footer is /// automatically written when the sink is closed. diff --git a/src/io/ipc/write/stream_async.rs b/src/io/ipc/write/stream_async.rs index 729c820667c..8b82a6597b1 100644 --- a/src/io/ipc/write/stream_async.rs +++ b/src/io/ipc/write/stream_async.rs @@ -13,7 +13,7 @@ use super::{default_ipc_fields, schema_to_bytes, Record}; use crate::datatypes::*; use crate::error::{Error, Result}; -/// A sink that writes array [`chunks`](Chunk) as an IPC stream. +/// A sink that writes array [`chunks`](crate::chunk::Chunk) as an IPC stream. /// /// The stream header is automatically written before writing the first chunk. 
/// diff --git a/src/types/bit_chunk.rs b/src/types/bit_chunk.rs index 030a91fcc4f..6466d7cbbb4 100644 --- a/src/types/bit_chunk.rs +++ b/src/types/bit_chunk.rs @@ -104,7 +104,7 @@ impl Iterator for BitChunkIter { unsafe impl crate::trusted_len::TrustedLen for BitChunkIter {} /// An [`Iterator`] over a [`BitChunk`] returning the index of each bit set in the chunk -/// Refer: https://lemire.me/blog/2018/03/08/iterating-over-set-bits-quickly-simd-edition/ +/// See <https://lemire.me/blog/2018/03/08/iterating-over-set-bits-quickly-simd-edition/> for details /// # Example /// ``` /// use arrow2::types::BitChunkOnes;