Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Add extend/extend_unchecked for MutableUtf8Array (#413)
Browse files Browse the repository at this point in the history
  • Loading branch information
VasanthakumarV committed Sep 18, 2021
1 parent aee8124 commit aa42ac2
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 37 deletions.
197 changes: 160 additions & 37 deletions src/array/utf8/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,75 @@ impl<O: Offset, P: AsRef<str>> FromIterator<Option<P>> for MutableUtf8Array<O> {
}

impl<O: Offset> MutableUtf8Array<O> {
/// Appends every string yielded by a [`TrustedLen`] iterator to this [`MutableUtf8Array`].
///
/// All appended slots are valid; use `extend_trusted_len` when the iterator yields
/// optional values instead.
#[inline]
pub fn extend_trusted_len_values<I, P>(&mut self, iterator: I)
where
    P: AsRef<str>,
    I: TrustedLen<Item = P>,
{
    // SAFETY: the `TrustedLen` bound on `I` is exactly the contract required by
    // `extend_trusted_len_values_unchecked`.
    unsafe {
        self.extend_trusted_len_values_unchecked(iterator);
    }
}

/// Appends every string yielded by `iterator` to this [`MutableUtf8Array`].
///
/// All appended slots are valid; use `extend_trusted_len_unchecked` when the iterator
/// yields optional values instead.
/// # Panics
/// Panics if the iterator does not report an upper bound in its `size_hint`.
/// # Safety
/// The iterator must be trusted len.
#[inline]
pub unsafe fn extend_trusted_len_values_unchecked<I, P>(&mut self, iterator: I)
where
    P: AsRef<str>,
    I: Iterator<Item = P>,
{
    // The upper bound is read before the iterator is consumed; it is the number of
    // slots that will be appended.
    let additional = iterator
        .size_hint()
        .1
        .expect("extend_trusted_len_values requires an upper limit");

    extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, iterator);

    // Without a bitmap every slot is implicitly valid, so there is nothing to record.
    if let Some(bitmap) = self.validity.as_mut() {
        bitmap.extend_constant(additional, true);
    }
}

/// Appends every optional string yielded by a [`TrustedLen`] iterator to this
/// [`MutableUtf8Array`]; `None` items become null slots.
#[inline]
pub fn extend_trusted_len<I, P>(&mut self, iterator: I)
where
    P: AsRef<str>,
    I: TrustedLen<Item = Option<P>>,
{
    // SAFETY: the `TrustedLen` bound on `I` is exactly the contract required by
    // `extend_trusted_len_unchecked`.
    unsafe {
        self.extend_trusted_len_unchecked(iterator);
    }
}

/// Extends the [`MutableUtf8Array`] from an iterator of optional values; `None` items
/// become null slots.
///
/// If the array has no validity bitmap yet, one is created with every existing slot
/// marked valid before extending. If the bitmap holds no nulls once the iterator has
/// been consumed, it is dropped again.
/// # Safety
/// The iterator must be trusted len.
#[inline]
pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)
where
    P: AsRef<str>,
    I: Iterator<Item = Option<P>>,
{
    // Read the current length up front: `self.validity` is mutably borrowed below.
    let len = self.len();
    let validity = self.validity.get_or_insert_with(|| {
        let mut validity = MutableBitmap::new();
        validity.extend_constant(len, true);
        validity
    });

    extend_from_trusted_len_iter(&mut self.offsets, &mut self.values, validity, iterator);

    // Keep the bitmap only when it actually records a null.
    if validity.null_count() == 0 {
        self.validity = None;
    }
}

/// Creates a [`MutableUtf8Array`] from an iterator of trusted length.
/// # Safety
/// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
Expand Down Expand Up @@ -377,37 +446,13 @@ where
P: AsRef<str>,
I: Iterator<Item = Option<P>>,
{
let (_, upper) = iterator.size_hint();
let len = upper.expect("trusted_len_unzip requires an upper limit");

let mut validity = MutableBitmap::with_capacity(len);
let mut offsets = MutableBuffer::<O>::with_capacity(len + 1);
let mut offsets = MutableBuffer::<O>::with_capacity(1);
let mut values = MutableBuffer::<u8>::new();
let mut validity = MutableBitmap::new();

let mut length = O::default();
let mut dst = offsets.as_mut_ptr();
std::ptr::write(dst, length);
dst = dst.add(1);
for item in iterator {
if let Some(item) = item {
validity.push(true);
let s = item.as_ref();
length += O::from_usize(s.len()).unwrap();
values.extend_from_slice(s.as_bytes());
} else {
validity.push(false);
values.extend_from_slice(b"");
};
offsets.push_unchecked(O::default());

std::ptr::write(dst, length);
dst = dst.add(1);
}
assert_eq!(
dst.offset_from(offsets.as_ptr()) as usize,
len + 1,
"Trusted iterator length was not accurately reported"
);
offsets.set_len(len + 1);
extend_from_trusted_len_iter(&mut offsets, &mut values, &mut validity, iterator);

let validity = if validity.null_count() > 0 {
Some(validity)
Expand Down Expand Up @@ -481,33 +526,111 @@ where
O: Offset,
P: AsRef<str>,
I: Iterator<Item = P>,
{
let mut offsets = MutableBuffer::<O>::with_capacity(1 + iterator.size_hint().1.unwrap());
let mut values = MutableBuffer::<u8>::new();

offsets.push_unchecked(O::default());

extend_from_trusted_len_values_iter(&mut offsets, &mut values, iterator);

(offsets, values)
}

/// Appends to `offsets` and `values` the data of every string yielded by `iterator`.
///
/// For each item, its bytes are appended to `values` and the new running end offset is
/// appended to `offsets`. The running offset starts from the current last element of
/// `offsets`, so `offsets` must not be empty on entry.
/// # Panics
/// Panics if the iterator reports no upper bound, if `offsets` is empty, if a string
/// length cannot be converted to `O`, or if the number of yielded items differs from
/// the reported upper bound.
/// # Safety
/// The caller must ensure that `iterator` is [`TrustedLen`]
#[inline]
unsafe fn extend_from_trusted_len_values_iter<I, P, O>(
    offsets: &mut MutableBuffer<O>,
    values: &mut MutableBuffer<u8>,
    iterator: I,
) where
    O: Offset,
    P: AsRef<str>,
    I: Iterator<Item = P>,
{
    let (_, upper) = iterator.size_hint();
    let additional = upper.expect("extend_from_trusted_len_values_iter requires an upper limit");

    offsets.reserve(additional);

    // Continue from the end offset already stored in `offsets`.
    let mut length = *offsets.last().unwrap();

    // Write cursor positioned one past the last initialized offset; the slots it walks
    // over were reserved above and are made visible by `set_len` at the end.
    let mut dst = offsets.as_mut_ptr();
    dst = dst.add(offsets.len());

    for item in iterator {
        let s = item.as_ref();

        length += O::from_usize(s.len()).unwrap();

        values.extend_from_slice(s.as_bytes());
        std::ptr::write(dst, length);

        dst = dst.add(1);
    }

    // `offsets.len()` is still the old length here, so this checks that exactly
    // `additional` offsets were written.
    assert_eq!(
        dst.offset_from(offsets.as_ptr()) as usize,
        offsets.len() + additional,
        "Trusted iterator length was not accurately reported"
    );

    offsets.set_len(offsets.len() + additional);
}

/// Appends to `offsets`, `values` and `validity` the data of every optional string
/// yielded by `iterator`.
///
/// A `Some` item appends its bytes to `values`, advances the running end offset and
/// pushes `true` to `validity`. A `None` item pushes `false` and repeats the previous
/// end offset. The running offset starts from the current last element of `offsets`,
/// so `offsets` must not be empty on entry.
/// # Panics
/// Panics if the iterator reports no upper bound, if `offsets` is empty, if a string
/// length cannot be converted to `O`, or if the number of yielded items differs from
/// the reported upper bound.
/// # Safety
/// The caller must ensure that `iterator` is [`TrustedLen`]
#[inline]
unsafe fn extend_from_trusted_len_iter<O, I, P>(
    offsets: &mut MutableBuffer<O>,
    values: &mut MutableBuffer<u8>,
    validity: &mut MutableBitmap,
    iterator: I,
) where
    O: Offset,
    P: AsRef<str>,
    I: Iterator<Item = Option<P>>,
{
    let (_, upper) = iterator.size_hint();
    let additional = upper.expect("extend_from_trusted_len_iter requires an upper limit");

    // Reserving up front is what makes the unchecked pushes and raw writes below valid
    // for a trusted-len iterator.
    offsets.reserve(additional);
    validity.reserve(additional);

    // Continue from the end offset already stored in `offsets`.
    let mut length = *offsets.last().unwrap();

    // Write cursor positioned one past the last initialized offset.
    let mut dst = offsets.as_mut_ptr();
    dst = dst.add(offsets.len());

    for item in iterator {
        if let Some(item) = item {
            let s = item.as_ref();

            length += O::from_usize(s.len()).unwrap();

            values.extend_from_slice(s.as_bytes());
            validity.push_unchecked(true);
        } else {
            validity.push_unchecked(false);
        };

        // Null slots repeat the previous offset (zero-length entry).
        std::ptr::write(dst, length);

        dst = dst.add(1);
    }

    // `offsets.len()` is still the old length here, so this checks that exactly
    // `additional` offsets were written.
    assert_eq!(
        dst.offset_from(offsets.as_ptr()) as usize,
        offsets.len() + additional,
        "Trusted iterator length was not accurately reported"
    );

    offsets.set_len(offsets.len() + additional);
}

/// Creates two [`MutableBuffer`]s from an iterator of `&str`.
Expand Down
36 changes: 36 additions & 0 deletions tests/it/array/utf8/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,39 @@ fn wrong_data_type() {
let values = MutableBuffer::from(b"abbb");
MutableUtf8Array::<i32>::from_data(DataType::Int8, offsets, values, None);
}

#[test]
fn test_extend_trusted_len_values() {
    let mut mutable = MutableUtf8Array::<i32>::new();

    // Two all-valid batches, then a batch that ends with a null.
    mutable.extend_trusted_len_values(["hi", "there"].iter());
    mutable.extend_trusted_len_values(["hello"].iter());
    mutable.extend_trusted_len(vec![Some("again"), None].into_iter());

    let array: Utf8Array<i32> = mutable.into();

    // Only the fifth (last) slot is null.
    let expected_validity = Some(Bitmap::from_u8_slice(&[0b00001111], 5));

    assert_eq!(array.values().as_slice(), b"hitherehelloagain");
    assert_eq!(array.offsets().as_slice(), &[0, 2, 7, 12, 17, 17]);
    assert_eq!(array.validity(), &expected_validity);
}

#[test]
fn test_extend_trusted_len() {
    let mut mutable = MutableUtf8Array::<i32>::new();

    // An all-valid optional batch, a batch starting with a null, then plain values.
    mutable.extend_trusted_len(vec![Some("hi"), Some("there")].into_iter());
    mutable.extend_trusted_len(vec![None, Some("hello")].into_iter());
    mutable.extend_trusted_len_values(["again"].iter());

    let array: Utf8Array<i32> = mutable.into();

    // Only the third slot is null.
    let expected_validity = Some(Bitmap::from_u8_slice(&[0b00011011], 5));

    assert_eq!(array.values().as_slice(), b"hitherehelloagain");
    assert_eq!(array.offsets().as_slice(), &[0, 2, 7, 7, 12, 17]);
    assert_eq!(array.validity(), &expected_validity);
}

0 comments on commit aa42ac2

Please sign in to comment.