From 6c52e50f61ac13ecb42f9076fd00ace10173184b Mon Sep 17 00:00:00 2001 From: Vasanthakumar Vijayasekaran Date: Sun, 3 Oct 2021 11:58:43 +0530 Subject: [PATCH] Added `extend`/`extend_unchecked` for `MutableBinaryArray` (#486) --- src/array/binary/mutable.rs | 222 ++++++++++++++++++++++++++----- tests/it/array/binary/mutable.rs | 35 +++++ 2 files changed, 224 insertions(+), 33 deletions(-) diff --git a/src/array/binary/mutable.rs b/src/array/binary/mutable.rs index 81f7294324f..f77109e2abf 100644 --- a/src/array/binary/mutable.rs +++ b/src/array/binary/mutable.rs @@ -273,6 +273,77 @@ impl MutableBinaryArray { unsafe { Self::try_from_trusted_len_iter_unchecked(iterator) } } + /// Extends the [`MutableBinaryArray`] from an iterator of trusted length. + /// This differs from `extend_trusted_len` which accepts iterator of optional values. + #[inline] + pub fn extend_trusted_len_values(&mut self, iterator: I) + where + P: AsRef<[u8]>, + I: TrustedLen, + { + // Safety: The iterator is `TrustedLen` + unsafe { self.extend_trusted_len_values_unchecked(iterator) } + } + + /// Extends the [`MutableBinaryArray`] from an `iterator` of values of trusted length. + /// This differs from `extend_trusted_len_unchecked` which accepts iterator of optional + /// values. + /// # Safety + /// The `iterator` must be [`TrustedLen`] + #[inline] + pub unsafe fn extend_trusted_len_values_unchecked(&mut self, iterator: I) + where + P: AsRef<[u8]>, + I: Iterator, + { + let (_, upper) = iterator.size_hint(); + let additional = upper.expect("extend_trusted_len_values requires an upper limit"); + + extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator); + + if let Some(validity) = self.validity.as_mut() { + validity.extend_constant(additional, true); + } + } + + /// Extends the [`MutableBinaryArray`] from an iterator of [`TrustedLen`] + #[inline] + pub fn extend_trusted_len(&mut self, iterator: I) + where + P: AsRef<[u8]>, + I: TrustedLen>, + { + // Safety: The iterator is `TrustedLen` + unsafe { self.extend_trusted_len_unchecked(iterator) } + } + + /// Extends the [`MutableBinaryArray`] from an iterator of [`TrustedLen`] + /// # Safety + /// The `iterator` must be [`TrustedLen`] + #[inline] + pub unsafe fn extend_trusted_len_unchecked(&mut self, iterator: I) + where + P: AsRef<[u8]>, + I: Iterator>, + { + if self.validity.is_none() { + let mut validity = MutableBitmap::new(); + validity.extend_constant(self.len(), true); + self.validity = Some(validity); + } + + extend_from_trusted_len_iter( + &mut self.offsets, + &mut self.values, + &mut self.validity.as_mut().unwrap(), + iterator, + ); + + if self.validity.as_mut().unwrap().null_count() == 0 { + self.validity = None; + } + } + /// Creates a new [`MutableBinaryArray`] from a [`Iterator`] of `&[u8]`. pub fn from_iter_values, I: Iterator>(iterator: I) -> Self { let (offsets, values) = values_iter(iterator); @@ -341,36 +412,21 @@ where let (_, upper) = iterator.size_hint(); let len = upper.expect("trusted_len_unzip requires an upper limit"); - let mut null = MutableBitmap::with_capacity(len); let mut offsets = MutableBuffer::::with_capacity(len + 1); let mut values = MutableBuffer::::new(); + let mut validity = MutableBitmap::new(); - let mut length = O::default(); - let mut dst = offsets.as_mut_ptr(); - std::ptr::write(dst, length); - dst = dst.add(1); - for item in iterator { - if let Some(item) = item { - null.push(true); - let s = item.as_ref(); - length += O::from_usize(s.len()).unwrap(); - values.extend_from_slice(s); - } else { - null.push(false); - values.extend_from_slice(b""); - }; + offsets.push_unchecked(O::default()); - std::ptr::write(dst, length); - dst = dst.add(1); - } - assert_eq!( - dst.offset_from(offsets.as_ptr()) as usize, - len + 1, - "Trusted iterator length was not accurately reported" - ); - offsets.set_len(len + 1); + extend_from_trusted_len_iter(&mut offsets, &mut values, &mut validity, iterator); - (null.into(), offsets, values) + let validity = if validity.null_count() > 0 { + Some(validity) + } else { + None + }; + + (validity, offsets, values) } /// # Safety @@ -438,26 +494,126 @@ where let mut offsets = MutableBuffer::::with_capacity(len + 1); let mut values = MutableBuffer::::new(); - let mut length = O::default(); + offsets.push_unchecked(O::default()); + + extend_from_trusted_len_values_iter(&mut offsets, &mut values, iterator); + + (offsets, values) +} + +// Populates `offsets` and `values` [`MutableBuffer`]s with information extracted +// from the incoming `iterator`. +// # Safety +// The caller must ensure the `iterator` is [`TrustedLen`] +#[inline] +unsafe fn extend_from_trusted_len_values_iter( + offsets: &mut MutableBuffer, + values: &mut MutableBuffer, + iterator: I, +) where + O: Offset, + P: AsRef<[u8]>, + I: Iterator, +{ + let (_, upper) = iterator.size_hint(); + let additional = upper.expect("extend_from_trusted_len_values_iter requires an upper limit"); + + offsets.reserve(additional); + + // Read in the last offset, will be used to increment and store + // new values later on + let mut length = *offsets.last().unwrap(); + + // Get a mutable pointer to the `offsets`, and move the pointer + // to the position, where a new value will be written let mut dst = offsets.as_mut_ptr(); - std::ptr::write(dst, length); - dst = dst.add(1); + dst = dst.add(offsets.len()); + for item in iterator { let s = item.as_ref(); + + // Calculate the new offset value length += O::from_usize(s.len()).unwrap(); + + // Push new entries for both `values` and `offsets` buffer values.extend_from_slice(s); + std::ptr::write(dst, length); + + // Move to the next position in offset buffer + dst = dst.add(1); + } + + debug_assert_eq!( + dst.offset_from(offsets.as_ptr()) as usize, + offsets.len() + additional, + "TrustedLen iterator's length was not accurately reported" + ); + // We make sure to set the new length for the `offsets` buffer + offsets.set_len(offsets.len() + additional); +} + +// Populates `offsets`, `values`, and `validity` [`MutableBuffer`]s with +// information extracted from the incoming `iterator`. +// +// # Safety +// The caller must ensure that `iterator` is [`TrustedLen`] +#[inline] +unsafe fn extend_from_trusted_len_iter( + offsets: &mut MutableBuffer, + values: &mut MutableBuffer, + validity: &mut MutableBitmap, + iterator: I, +) where + O: Offset, + P: AsRef<[u8]>, + I: Iterator>, +{ + let (_, upper) = iterator.size_hint(); + let additional = upper.expect("extend_from_trusted_len_iter requires an upper limit"); + + offsets.reserve(additional); + validity.reserve(additional); + + // Read in the last offset, will be used to increment and store + // new values later on + let mut length = *offsets.last().unwrap(); + + // Get a mutable pointer to the `offsets`, and move the pointer + // to the position, where a new value will be written + let mut dst = offsets.as_mut_ptr(); + dst = dst.add(offsets.len()); + + for item in iterator { + if let Some(item) = item { + let bytes = item.as_ref(); + + // Calculate new offset value + length += O::from_usize(bytes.len()).unwrap(); + + // Push new values for `values` and `validity` buffer + values.extend_from_slice(bytes); + validity.push_unchecked(true); + } else { + // If `None`, update only `validity` + validity.push_unchecked(false); + } + + // Push new offset or old offset depending on the `item` std::ptr::write(dst, length); + + // Move to the next position in offset buffer dst = dst.add(1); } - assert_eq!( + + debug_assert_eq!( dst.offset_from(offsets.as_ptr()) as usize, - len + 1, - "Trusted iterator length was not accurately reported" + offsets.len() + additional, + "TrustedLen iterator's length was not accurately reported" ); - offsets.set_len(len + 1); - (offsets, values) + // We make sure to set the new length for the `offsets` buffer + offsets.set_len(offsets.len() + additional); } /// Creates two [`MutableBuffer`]s from an iterator of `&[u8]`. diff --git a/tests/it/array/binary/mutable.rs b/tests/it/array/binary/mutable.rs index 6763a9d4984..bbf79431449 100644 --- a/tests/it/array/binary/mutable.rs +++ b/tests/it/array/binary/mutable.rs @@ -9,3 +9,38 @@ fn push_null() { let array: BinaryArray = array.into(); assert_eq!(array.validity(), Some(&Bitmap::from([false]))); } + +#[test] +fn extend_trusted_len_values() { + let mut array = MutableBinaryArray::::new(); + + array.extend_trusted_len_values(vec![b"first".to_vec(), b"second".to_vec()].into_iter()); + array.extend_trusted_len_values(vec![b"third".to_vec()].into_iter()); + array.extend_trusted_len(vec![None, Some(b"fourth".to_vec())].into_iter()); + + let array: BinaryArray = array.into(); + + assert_eq!(array.values().as_slice(), b"firstsecondthirdfourth"); + assert_eq!(array.offsets().as_slice(), &[0, 5, 11, 16, 16, 22]); + assert_eq!( + array.validity(), + Some(&Bitmap::from_u8_slice(&[0b00010111], 5)) + ); +} + +#[test] +fn extend_trusted_len() { + let mut array = MutableBinaryArray::::new(); + + array.extend_trusted_len(vec![Some(b"first".to_vec()), Some(b"second".to_vec())].into_iter()); + array.extend_trusted_len(vec![None, Some(b"third".to_vec())].into_iter()); + + let array: BinaryArray = array.into(); + + assert_eq!(array.values().as_slice(), b"firstsecondthird"); + assert_eq!(array.offsets().as_slice(), &[0, 5, 11, 11, 16]); + assert_eq!( + array.validity(), + Some(&Bitmap::from_u8_slice(&[0b00001011], 4)) + ); +}