From 544d692f826110c2f24336945665f65921148430 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Wed, 12 Jan 2022 17:20:50 +0000 Subject: [PATCH] Simplified code --- src/array/binary/mutable.rs | 245 +--------------------------------- src/array/mod.rs | 2 + src/array/physical_binary.rs | 248 +++++++++++++++++++++++++++++++++++ src/array/utf8/mutable.rs | 240 +++------------------------------ 4 files changed, 266 insertions(+), 469 deletions(-) create mode 100644 src/array/physical_binary.rs diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index 2660fe4f0ab..f39fa62c818 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -9,6 +9,7 @@ use crate::{ }; use super::BinaryArray; +use crate::array::physical_binary::*; /// The Arrow's equivalent to `Vec>>`. /// Converting a [`MutableBinaryArray`] into a [`BinaryArray`] is `O(1)`. @@ -432,247 +433,3 @@ impl> TryPush> for MutableBinaryArray { Ok(()) } } - -/// Creates [`MutableBitmap`] and two [`Vec`]s from an iterator of `Option`. -/// The first buffer corresponds to a offset buffer, the second one -/// corresponds to a values buffer. -/// # Safety -/// The caller must ensure that `iterator` is `TrustedLen`. -#[inline] -unsafe fn trusted_len_unzip(iterator: I) -> (Option, Vec, Vec) -where - O: Offset, - P: AsRef<[u8]>, - I: Iterator>, -{ - let (_, upper) = iterator.size_hint(); - let len = upper.expect("trusted_len_unzip requires an upper limit"); - - let mut offsets = Vec::::with_capacity(len + 1); - let mut values = Vec::::new(); - let mut validity = MutableBitmap::new(); - - offsets.push(O::default()); - - extend_from_trusted_len_iter(&mut offsets, &mut values, &mut validity, iterator); - - let validity = if validity.null_count() > 0 { - Some(validity) - } else { - None - }; - - (validity, offsets, values) -} - -/// # Safety -/// The caller must ensure that `iterator` is `TrustedLen`. -#[inline] -#[allow(clippy::type_complexity)] -pub(crate) unsafe fn try_trusted_len_unzip( - iterator: I, -) -> std::result::Result<(Option, Vec, Vec), E> -where - O: Offset, - P: AsRef<[u8]>, - I: Iterator, E>>, -{ - let (_, upper) = iterator.size_hint(); - let len = upper.expect("trusted_len_unzip requires an upper limit"); - - let mut null = MutableBitmap::with_capacity(len); - let mut offsets = Vec::::with_capacity(len + 1); - let mut values = Vec::::new(); - - let mut length = O::default(); - let mut dst = offsets.as_mut_ptr(); - std::ptr::write(dst, length); - dst = dst.add(1); - for item in iterator { - if let Some(item) = item? { - null.push(true); - let s = item.as_ref(); - length += O::from_usize(s.len()).unwrap(); - values.extend_from_slice(s); - } else { - null.push(false); - }; - - std::ptr::write(dst, length); - dst = dst.add(1); - } - assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - len + 1, - "Trusted iterator length was not accurately reported" - ); - offsets.set_len(len + 1); - - Ok((null.into(), offsets, values)) -} - -/// Creates two [`Buffer`]s from an iterator of `&[u8]`. -/// The first buffer corresponds to a offset buffer, the second to a values buffer. -/// # Safety -/// The caller must ensure that `iterator` is [`TrustedLen`]. -#[inline] -pub(crate) unsafe fn trusted_len_values_iter(iterator: I) -> (Vec, Vec) -where - O: Offset, - P: AsRef<[u8]>, - I: Iterator, -{ - let (_, upper) = iterator.size_hint(); - let len = upper.expect("trusted_len_unzip requires an upper limit"); - - let mut offsets = Vec::::with_capacity(len + 1); - let mut values = Vec::::new(); - - offsets.push(O::default()); - - extend_from_trusted_len_values_iter(&mut offsets, &mut values, iterator); - - (offsets, values) -} - -// Populates `offsets` and `values` [`Vec`]s with information extracted -// from the incoming `iterator`. -// # Safety -// The caller must ensure the `iterator` is [`TrustedLen`] -#[inline] -unsafe fn extend_from_trusted_len_values_iter( - offsets: &mut Vec, - values: &mut Vec, - iterator: I, -) where - O: Offset, - P: AsRef<[u8]>, - I: Iterator, -{ - let (_, upper) = iterator.size_hint(); - let additional = upper.expect("extend_from_trusted_len_values_iter requires an upper limit"); - - offsets.reserve(additional); - - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); - - // Get a mutable pointer to the `offsets`, and move the pointer - // to the position, where a new value will be written - let mut dst = offsets.as_mut_ptr(); - dst = dst.add(offsets.len()); - - for item in iterator { - let s = item.as_ref(); - - // Calculate the new offset value - length += O::from_usize(s.len()).unwrap(); - - // Push new entries for both `values` and `offsets` buffer - values.extend_from_slice(s); - std::ptr::write(dst, length); - - // Move to the next position in offset buffer - dst = dst.add(1); - } - - debug_assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + additional, - "TrustedLen iterator's length was not accurately reported" - ); - - // We make sure to set the new length for the `offsets` buffer - offsets.set_len(offsets.len() + additional); -} - -// Populates `offsets`, `values`, and `validity` [`Vec`]s with -// information extracted from the incoming `iterator`. -// -// # Safety -// The caller must ensure that `iterator` is [`TrustedLen`] -#[inline] -unsafe fn extend_from_trusted_len_iter( - offsets: &mut Vec, - values: &mut Vec, - validity: &mut MutableBitmap, - iterator: I, -) where - O: Offset, - P: AsRef<[u8]>, - I: Iterator>, -{ - let (_, upper) = iterator.size_hint(); - let additional = upper.expect("extend_from_trusted_len_iter requires an upper limit"); - - offsets.reserve(additional); - validity.reserve(additional); - - // Read in the last offset, will be used to increment and store - // new values later on - let mut length = *offsets.last().unwrap(); - - // Get a mutable pointer to the `offsets`, and move the pointer - // to the position, where a new value will be written - let mut dst = offsets.as_mut_ptr(); - dst = dst.add(offsets.len()); - - for item in iterator { - if let Some(item) = item { - let bytes = item.as_ref(); - - // Calculate new offset value - length += O::from_usize(bytes.len()).unwrap(); - - // Push new values for `values` and `validity` buffer - values.extend_from_slice(bytes); - validity.push_unchecked(true); - } else { - // If `None`, update only `validity` - validity.push_unchecked(false); - } - - // Push new offset or old offset depending on the `item` - std::ptr::write(dst, length); - - // Move to the next position in offset buffer - dst = dst.add(1); - } - - debug_assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + additional, - "TrustedLen iterator's length was not accurately reported" - ); - - // We make sure to set the new length for the `offsets` buffer - offsets.set_len(offsets.len() + additional); -} - -/// Creates two [`Vec`]s from an iterator of `&[u8]`. -/// The first buffer corresponds to a offset buffer, the second to a values buffer. -#[inline] -fn values_iter(iterator: I) -> (Vec, Vec) -where - O: Offset, - P: AsRef<[u8]>, - I: Iterator, -{ - let (lower, _) = iterator.size_hint(); - - let mut offsets = Vec::::with_capacity(lower + 1); - let mut values = Vec::::new(); - - let mut length = O::default(); - offsets.push(length); - - for item in iterator { - let s = item.as_ref(); - length += O::from_usize(s.len()).unwrap(); - values.extend_from_slice(s); - - offsets.push(length) - } - (offsets, values) -} diff --git a/src/array/mod.rs b/src/array/mod.rs index b9a83d87225..ce205f32c89 100644 --- a/src/array/mod.rs +++ b/src/array/mod.rs @@ -26,6 +26,8 @@ use crate::{ datatypes::DataType, }; +pub(self) mod physical_binary; + /// A trait representing an immutable Arrow array. Arrow arrays are trait objects /// that are infalibly downcasted to concrete types according to the [`Array::data_type`]. pub trait Array: Send + Sync { diff --git a/src/array/physical_binary.rs b/src/array/physical_binary.rs new file mode 100644 index 00000000000..eb7ef69748a --- /dev/null +++ b/src/array/physical_binary.rs @@ -0,0 +1,248 @@ +use crate::array::Offset; +use crate::bitmap::MutableBitmap; + +/// # Safety +/// The caller must ensure that `iterator` is `TrustedLen`. +#[inline] +#[allow(clippy::type_complexity)] +pub(crate) unsafe fn try_trusted_len_unzip( + iterator: I, +) -> std::result::Result<(Option, Vec, Vec), E> +where + O: Offset, + P: AsRef<[u8]>, + I: Iterator, E>>, +{ + let (_, upper) = iterator.size_hint(); + let len = upper.expect("trusted_len_unzip requires an upper limit"); + + let mut null = MutableBitmap::with_capacity(len); + let mut offsets = Vec::::with_capacity(len + 1); + let mut values = Vec::::new(); + + let mut length = O::default(); + let mut dst = offsets.as_mut_ptr(); + std::ptr::write(dst, length); + dst = dst.add(1); + for item in iterator { + if let Some(item) = item? { + null.push_unchecked(true); + let s = item.as_ref(); + length += O::from_usize(s.len()).unwrap(); + values.extend_from_slice(s); + } else { + null.push_unchecked(false); + }; + + std::ptr::write(dst, length); + dst = dst.add(1); + } + assert_eq!( + dst.offset_from(offsets.as_ptr()) as usize, + len + 1, + "Trusted iterator length was not accurately reported" + ); + offsets.set_len(len + 1); + + Ok((null.into(), offsets, values)) +} + +/// Creates [`MutableBitmap`] and two [`Vec`]s from an iterator of `Option`. +/// The first buffer corresponds to a offset buffer, the second one +/// corresponds to a values buffer. +/// # Safety +/// The caller must ensure that `iterator` is `TrustedLen`. +#[inline] +pub(crate) unsafe fn trusted_len_unzip( + iterator: I, +) -> (Option, Vec, Vec) +where + O: Offset, + P: AsRef<[u8]>, + I: Iterator>, +{ + let (_, upper) = iterator.size_hint(); + let len = upper.expect("trusted_len_unzip requires an upper limit"); + + let mut offsets = Vec::::with_capacity(len + 1); + let mut values = Vec::::new(); + let mut validity = MutableBitmap::new(); + + offsets.push(O::default()); + + extend_from_trusted_len_iter(&mut offsets, &mut values, &mut validity, iterator); + + let validity = if validity.null_count() > 0 { + Some(validity) + } else { + None + }; + + (validity, offsets, values) +} + +/// Creates two [`Buffer`]s from an iterator of `&[u8]`. +/// The first buffer corresponds to a offset buffer, the second to a values buffer. +/// # Safety +/// The caller must ensure that `iterator` is [`TrustedLen`]. +#[inline] +pub(crate) unsafe fn trusted_len_values_iter(iterator: I) -> (Vec, Vec) +where + O: Offset, + P: AsRef<[u8]>, + I: Iterator, +{ + let (_, upper) = iterator.size_hint(); + let len = upper.expect("trusted_len_unzip requires an upper limit"); + + let mut offsets = Vec::::with_capacity(len + 1); + let mut values = Vec::::new(); + + offsets.push(O::default()); + + extend_from_trusted_len_values_iter(&mut offsets, &mut values, iterator); + + (offsets, values) +} + +// Populates `offsets` and `values` [`Vec`]s with information extracted +// from the incoming `iterator`. +// # Safety +// The caller must ensure the `iterator` is [`TrustedLen`] +#[inline] +pub(crate) unsafe fn extend_from_trusted_len_values_iter( + offsets: &mut Vec, + values: &mut Vec, + iterator: I, +) where + O: Offset, + P: AsRef<[u8]>, + I: Iterator, +{ + let (_, upper) = iterator.size_hint(); + let additional = upper.expect("extend_from_trusted_len_values_iter requires an upper limit"); + + offsets.reserve(additional); + + // Read in the last offset, will be used to increment and store + // new values later on + let mut length = *offsets.last().unwrap(); + + // Get a mutable pointer to the `offsets`, and move the pointer + // to the position, where a new value will be written + let mut dst = offsets.as_mut_ptr(); + dst = dst.add(offsets.len()); + + for item in iterator { + let s = item.as_ref(); + + // Calculate the new offset value + length += O::from_usize(s.len()).unwrap(); + + // Push new entries for both `values` and `offsets` buffer + values.extend_from_slice(s); + std::ptr::write(dst, length); + + // Move to the next position in offset buffer + dst = dst.add(1); + } + + debug_assert_eq!( + dst.offset_from(offsets.as_ptr()) as usize, + offsets.len() + additional, + "TrustedLen iterator's length was not accurately reported" + ); + + // We make sure to set the new length for the `offsets` buffer + offsets.set_len(offsets.len() + additional); +} + +// Populates `offsets`, `values`, and `validity` [`Vec`]s with +// information extracted from the incoming `iterator`. +// +// # Safety +// The caller must ensure that `iterator` is [`TrustedLen`] +#[inline] +pub(crate) unsafe fn extend_from_trusted_len_iter( + offsets: &mut Vec, + values: &mut Vec, + validity: &mut MutableBitmap, + iterator: I, +) where + O: Offset, + P: AsRef<[u8]>, + I: Iterator>, +{ + let (_, upper) = iterator.size_hint(); + let additional = upper.expect("extend_from_trusted_len_iter requires an upper limit"); + + offsets.reserve(additional); + validity.reserve(additional); + + // Read in the last offset, will be used to increment and store + // new values later on + let mut length = *offsets.last().unwrap(); + + // Get a mutable pointer to the `offsets`, and move the pointer + // to the position, where a new value will be written + let mut dst = offsets.as_mut_ptr(); + dst = dst.add(offsets.len()); + + for item in iterator { + if let Some(item) = item { + let bytes = item.as_ref(); + + // Calculate new offset value + length += O::from_usize(bytes.len()).unwrap(); + + // Push new values for `values` and `validity` buffer + values.extend_from_slice(bytes); + validity.push_unchecked(true); + } else { + // If `None`, update only `validity` + validity.push_unchecked(false); + } + + // Push new offset or old offset depending on the `item` + std::ptr::write(dst, length); + + // Move to the next position in offset buffer + dst = dst.add(1); + } + + debug_assert_eq!( + dst.offset_from(offsets.as_ptr()) as usize, + offsets.len() + additional, + "TrustedLen iterator's length was not accurately reported" + ); + + // We make sure to set the new length for the `offsets` buffer + offsets.set_len(offsets.len() + additional); +} + +/// Creates two [`Vec`]s from an iterator of `&[u8]`. +/// The first buffer corresponds to a offset buffer, the second to a values buffer. +#[inline] +pub(crate) fn values_iter(iterator: I) -> (Vec, Vec) +where + O: Offset, + P: AsRef<[u8]>, + I: Iterator, +{ + let (lower, _) = iterator.size_hint(); + + let mut offsets = Vec::::with_capacity(lower + 1); + let mut values = Vec::::new(); + + let mut length = O::default(); + offsets.push(length); + + for item in iterator { + let s = item.as_ref(); + length += O::from_usize(s.len()).unwrap(); + values.extend_from_slice(s); + + offsets.push(length) + } + (offsets, values) +} diff --git a/src/array/utf8/mutable.rs b/src/array/utf8/mutable.rs index c4eb85527d6..cc8b170855e 100644 --- a/src/array/utf8/mutable.rs +++ b/src/array/utf8/mutable.rs @@ -12,6 +12,15 @@ use crate::{ }; use super::Utf8Array; +use crate::array::physical_binary::*; + +struct Wrapper

(P); +impl> AsRef<[u8]> for Wrapper { + #[inline] + fn as_ref(&self) -> &[u8] { + self.0.as_ref().as_bytes() + } +} /// The mutable version of [`Utf8Array`]. See [`MutableArray`] for more details. #[derive(Debug)] @@ -283,6 +292,7 @@ impl MutableUtf8Array { let (_, upper) = iterator.size_hint(); let additional = upper.expect("extend_trusted_len_values requires an upper limit"); + let iterator = iterator.map(Wrapper); extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator); if let Some(validity) = self.validity.as_mut() { @@ -315,6 +325,7 @@ impl MutableUtf8Array { self.validity = Some(validity); } + let iterator = iterator.map(|x| x.map(Wrapper)); extend_from_trusted_len_iter( &mut self.offsets, &mut self.values, @@ -337,6 +348,7 @@ impl MutableUtf8Array { P: AsRef, I: Iterator>, { + let iterator = iterator.map(|x| x.map(Wrapper)); let (validity, offsets, values) = trusted_len_unzip(iterator); // soundness: P is `str` @@ -362,6 +374,7 @@ impl MutableUtf8Array { pub unsafe fn from_trusted_len_values_iter_unchecked, I: Iterator>( iterator: I, ) -> Self { + let iterator = iterator.map(Wrapper); let (offsets, values) = unsafe { trusted_len_values_iter(iterator) }; // soundness: T is AsRef Self::from_data_unchecked(Self::default_data_type(), offsets, values, None) @@ -404,7 +417,7 @@ impl MutableUtf8Array { { let iterator = iterator.into_iter(); - // soundness: assumed trusted len + let iterator = iterator.map(|x| x.map(|x| x.map(Wrapper))); let (validity, offsets, values) = try_trusted_len_unzip(iterator)?; // soundness: P is `str` @@ -429,6 +442,7 @@ impl MutableUtf8Array { /// Creates a new [`MutableUtf8Array`] from a [`Iterator`] of `&str`. pub fn from_iter_values, I: Iterator>(iterator: I) -> Self { + let iterator = iterator.map(Wrapper); let (offsets, values) = values_iter(iterator); // soundness: T: AsRef unsafe { Self::from_data_unchecked(Self::default_data_type(), offsets, values, None) } @@ -476,227 +490,3 @@ impl> TryPush> for MutableUtf8Array { Ok(()) } } - -/// Creates [`MutableBitmap`] and two [`Vec`]s from an iterator of `Option`. -/// The first buffer corresponds to a offset buffer, the second one -/// corresponds to a values buffer. -/// # Safety -/// The caller must ensure that `iterator` is `TrustedLen`. -#[inline] -unsafe fn trusted_len_unzip(iterator: I) -> (Option, Vec, Vec) -where - O: Offset, - P: AsRef, - I: Iterator>, -{ - let mut offsets = Vec::::with_capacity(1); - let mut values = Vec::::new(); - let mut validity = MutableBitmap::new(); - - offsets.push(O::default()); - - extend_from_trusted_len_iter(&mut offsets, &mut values, &mut validity, iterator); - - let validity = if validity.null_count() > 0 { - Some(validity) - } else { - None - }; - - (validity, offsets, values) -} - -/// # Safety -/// The caller must ensure that `iterator` is `TrustedLen`. -#[inline] -#[allow(clippy::type_complexity)] -pub(crate) unsafe fn try_trusted_len_unzip( - iterator: I, -) -> std::result::Result<(Option, Vec, Vec), E> -where - O: Offset, - P: AsRef, - I: Iterator, E>>, -{ - let (_, upper) = iterator.size_hint(); - let len = upper.expect("trusted_len_unzip requires an upper limit"); - - let mut validity = MutableBitmap::with_capacity(len); - let mut offsets = Vec::::with_capacity(len + 1); - let mut values = Vec::::new(); - - let mut length = O::default(); - let mut dst = offsets.as_mut_ptr(); - std::ptr::write(dst, length); - dst = dst.add(1); - for item in iterator { - if let Some(item) = item? { - validity.push(true); - let s = item.as_ref(); - length += O::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); - } else { - validity.push(false); - }; - std::ptr::write(dst, length); - dst = dst.add(1); - } - assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - len + 1, - "Trusted iterator length was not accurately reported" - ); - offsets.set_len(len + 1); - - let validity = if validity.null_count() > 0 { - Some(validity) - } else { - None - }; - - Ok((validity, offsets, values)) -} - -/// Creates two [`Buffer`]s from an iterator of `&str`. -/// The first buffer corresponds to a offset buffer, the second to a values buffer. -/// # Safety -/// The caller must ensure that `iterator` is [`TrustedLen`]. -#[inline] -pub(crate) unsafe fn trusted_len_values_iter(iterator: I) -> (Vec, Vec) -where - O: Offset, - P: AsRef, - I: Iterator, -{ - let mut offsets = Vec::::with_capacity(1 + iterator.size_hint().1.unwrap()); - let mut values = Vec::::new(); - - offsets.push(O::default()); - - extend_from_trusted_len_values_iter(&mut offsets, &mut values, iterator); - - (offsets, values) -} - -/// Populates `offsets` and `values` [`Buffer`] with information -/// extracted from the incoming iterator. -/// # Safety -/// The caller must ensure that `iterator` is [`TrustedLen`] -#[inline] -unsafe fn extend_from_trusted_len_values_iter( - offsets: &mut Vec, - values: &mut Vec, - iterator: I, -) where - O: Offset, - P: AsRef, - I: Iterator, -{ - let (_, upper) = iterator.size_hint(); - let additional = upper.expect("extend_from_trusted_len_iter_values requires an upper limit"); - - offsets.reserve(additional); - - let mut length = *offsets.last().unwrap(); - - let mut dst = offsets.as_mut_ptr(); - dst = dst.add(offsets.len()); - - for item in iterator { - let s = item.as_ref(); - - length += O::from_usize(s.len()).unwrap(); - - values.extend_from_slice(s.as_bytes()); - std::ptr::write(dst, length); - - dst = dst.add(1); - } - - assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + additional, - "Trusted iterator length was not accurately reported" - ); - - offsets.set_len(offsets.len() + additional); -} - -/// Populates `offsets`, `values`, and validity [`Buffer`] with information -/// extracted from the incoming iterator. -/// # Safety -/// The caller must ensure that `iterator` is [`TrustedLen`] -#[inline] -unsafe fn extend_from_trusted_len_iter( - offsets: &mut Vec, - values: &mut Vec, - validity: &mut MutableBitmap, - iterator: I, -) where - O: Offset, - P: AsRef, - I: Iterator>, -{ - let (_, upper) = iterator.size_hint(); - let additional = upper.expect("extend_from_trusted_len_values_iter requires an upper limit"); - - offsets.reserve(additional); - validity.reserve(additional); - - let mut length = *offsets.last().unwrap(); - - let mut dst = offsets.as_mut_ptr(); - dst = dst.add(offsets.len()); - - for item in iterator { - if let Some(item) = item { - let s = item.as_ref(); - - length += O::from_usize(s.len()).unwrap(); - - values.extend_from_slice(s.as_bytes()); - validity.push_unchecked(true); - } else { - validity.push_unchecked(false); - }; - - std::ptr::write(dst, length); - - dst = dst.add(1); - } - - assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - offsets.len() + additional, - "Trusted iterator length was not accurately reported" - ); - - offsets.set_len(offsets.len() + additional); -} - -/// Creates two [`Vec`]s from an iterator of `&str`. -/// The first buffer corresponds to a offset buffer, the second to a values buffer. -#[inline] -fn values_iter(iterator: I) -> (Vec, Vec) -where - O: Offset, - P: AsRef, - I: Iterator, -{ - let (lower, _) = iterator.size_hint(); - - let mut offsets = Vec::::with_capacity(lower + 1); - let mut values = Vec::::new(); - - let mut length = O::default(); - offsets.push(length); - - for item in iterator { - let s = item.as_ref(); - length += O::from_usize(s.len()).unwrap(); - values.extend_from_slice(s.as_bytes()); - - offsets.push(length) - } - (offsets, values) -}