From 8e843ed5f03f79b6f06f344867e0570e8cd6579b Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Wed, 7 Dec 2022 22:46:40 +0000 Subject: [PATCH] Improved perf of take --- src/array/binary/mod.rs | 2 +- src/array/utf8/mod.rs | 2 +- src/compute/take/generic_binary.rs | 33 ++++++++++-------------------- src/offset.rs | 6 ++++++ 4 files changed, 19 insertions(+), 24 deletions(-) diff --git a/src/array/binary/mod.rs b/src/array/binary/mod.rs index a11469b220d..1a5abdcc330 100644 --- a/src/array/binary/mod.rs +++ b/src/array/binary/mod.rs @@ -40,7 +40,7 @@ pub use mutable::*; /// assert_eq!(array.values_iter().collect::>(), vec![[1, 2].as_ref(), &[], &[3]]); /// // the underlying representation: /// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3])); -/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 3])); +/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3])); /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); /// ``` /// diff --git a/src/array/utf8/mod.rs b/src/array/utf8/mod.rs index 36ce27b28bf..f8b8b86a8b8 100644 --- a/src/array/utf8/mod.rs +++ b/src/array/utf8/mod.rs @@ -51,7 +51,7 @@ impl> AsRef<[u8]> for StrAsBytes { /// // the underlying representation /// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true]))); /// assert_eq!(array.values(), &Buffer::from(b"hithere".to_vec())); -/// assert_eq!(array.offsets(), &Buffer::from(vec![0, 2, 2, 2 + 5])); +/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 2 + 5])); /// # } /// ``` /// diff --git a/src/compute/take/generic_binary.rs b/src/compute/take/generic_binary.rs index 4fc4d01138d..a9cf9c199c2 100644 --- a/src/compute/take/generic_binary.rs +++ b/src/compute/take/generic_binary.rs @@ -17,10 +17,10 @@ pub fn take_values( let mut buffer = Vec::with_capacity(new_len); starts .iter() - .zip(offsets.buffer().windows(2)) - .for_each(|(start_, window)| { - let start = start_.to_usize(); - let end = (*start_ + (window[1] - window[0])).to_usize(); + .map(|start| start.to_usize()) + .zip(offsets.lengths()) + .for_each(|(start, length)| { + let end = start + length; buffer.extend_from_slice(&values[start..end]); }); buffer.into() @@ -32,27 +32,16 @@ pub fn take_no_validity( values: &[u8], indices: &[I], ) -> (OffsetsBuffer, Buffer, Option) { - let mut length = O::zero(); let mut buffer = Vec::::new(); - let offsets = offsets.buffer(); - let offsets = indices.iter().map(|index| { - let index = index.to_usize(); - let start = offsets[index]; - let length_h = offsets[index + 1] - start; - length += length_h; - - let _start = start.to_usize(); - let end = (start + length_h).to_usize(); - buffer.extend_from_slice(&values[_start..end]); - length + let lengths = indices.iter().map(|index| index.to_usize()).map(|index| { + let (start, end) = offsets.start_end(index); + // todo: remove this bound check + buffer.extend_from_slice(&values[start..end]); + end - start }); - let offsets = std::iter::once(O::zero()) - .chain(offsets) - .collect::>(); - // Safety: offsets _are_ monotonically increasing - let offsets = unsafe { Offsets::new_unchecked(offsets) }.into(); + let offsets = Offsets::try_from_lengths(lengths).expect(""); - (offsets, buffer.into(), None) + (offsets.into(), buffer.into(), None) } // take implementation when only values contain nulls diff --git a/src/offset.rs b/src/offset.rs index 6d0878fa2b5..2337f082218 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -420,6 +420,12 @@ impl OffsetsBuffer { Self(self.0.slice_unchecked(offset, length)) } + /// Returns an iterator with the lengths of the offsets + #[inline] + pub fn lengths(&self) -> impl Iterator + '_ { + self.0.windows(2).map(|w| (w[1] - w[0]).to_usize()) + } + /// Returns the inner [`Buffer`]. #[inline] pub fn into_inner(self) -> Buffer {