From 75c7bfbdec968bff065d2b7642ede564e42d5b48 Mon Sep 17 00:00:00 2001 From: "Jorge C. Leitao" Date: Sun, 19 Jun 2022 08:20:27 +0000 Subject: [PATCH] Improved assign --- benches/assign_ops.rs | 7 ++- examples/cow.rs | 8 ++-- src/array/boolean/mod.rs | 19 -------- src/array/primitive/mod.rs | 55 +++++------------------ src/array/primitive/mutable.rs | 20 +-------- src/buffer/immutable.rs | 26 +++++++---- src/compute/arity_assign.rs | 77 +++++++++++++++------------------ tests/it/array/boolean/mod.rs | 24 ---------- tests/it/array/primitive/mod.rs | 22 ---------- 9 files changed, 72 insertions(+), 186 deletions(-) diff --git a/benches/assign_ops.rs b/benches/assign_ops.rs index dc4c9b182f3..7b169ec42df 100644 --- a/benches/assign_ops.rs +++ b/benches/assign_ops.rs @@ -1,6 +1,6 @@ use criterion::{criterion_group, criterion_main, Criterion}; -use arrow2::compute::arity_assign::binary; +use arrow2::compute::arity_assign::{binary, unary}; use arrow2::{ compute::arithmetics::basic::{mul, mul_scalar}, util::bench_util::*, @@ -13,8 +13,7 @@ fn add_benchmark(c: &mut Criterion) { let mut arr_a = create_primitive_array::(size, 0.2); c.bench_function(&format!("apply_mul 2^{}", log2_size), |b| { b.iter(|| { - criterion::black_box(&mut arr_a) - .apply_values_mut(|x| x.iter_mut().for_each(|x| *x *= 1.01)); + unary(criterion::black_box(&mut arr_a), |x| x * 1.01); assert!(!arr_a.value(10).is_nan()); }) }); @@ -30,7 +29,7 @@ fn add_benchmark(c: &mut Criterion) { let mut arr_a = create_primitive_array::(size, 0.2); let mut arr_b = create_primitive_array_with_seed::(size, 0.2, 10); // convert to be close to 1.01 - arr_b.apply_values_mut(|x| x.iter_mut().for_each(|x| *x = 1.01 + *x / 20.0)); + unary(&mut arr_b, |x| 1.01 + x / 20.0); c.bench_function(&format!("apply_mul null 2^{}", log2_size), |b| { b.iter(|| { diff --git a/examples/cow.rs b/examples/cow.rs index 3255d365f4d..5548f453a16 100644 --- a/examples/cow.rs +++ b/examples/cow.rs @@ -1,19 +1,21 @@ // This example demos how to operate on arrays in-place. use arrow2::array::{Array, PrimitiveArray}; +use arrow2::compute::arity_assign; fn main() { - // say we have have received an array + // say we have have received an `Array` let mut array: Box = PrimitiveArray::from_vec(vec![1i32, 2]).boxed(); // we can apply a transformation to its values without allocating a new array as follows: + // 1. downcast it to the correct type (known via `array.data_type().to_physical_type()`) let array = array .as_any_mut() .downcast_mut::>() .unwrap(); - // 2. call `apply_values` with the function to apply over the values - array.apply_values_mut(|x| x.iter_mut().for_each(|x| *x *= 10)); + // 2. call `unary` with the function to apply to each value + arity_assign::unary(array, |x| x * 10); // confirm that it gives the right result :) assert_eq!(array, &PrimitiveArray::from_vec(vec![10i32, 20])); diff --git a/src/array/boolean/mod.rs b/src/array/boolean/mod.rs index 85ad172be0b..9901386cc4a 100644 --- a/src/array/boolean/mod.rs +++ b/src/array/boolean/mod.rs @@ -236,25 +236,6 @@ impl BooleanArray { self.values = values.into(); } - /// Applies a function `f` to the validity of this array, cloning it - /// iff it is being shared. - /// - /// This is an API to leverage clone-on-write - /// # Implementation - /// This function is `O(f)` if the data is not being shared, and `O(N) + O(f)` - /// if it is being shared (since it results in a `O(N)` memcopy). - /// # Panics - /// This function panics if the function modifies the length of the [`MutableBitmap`]. - pub fn apply_validity_mut(&mut self, f: F) { - if let Some(validity) = self.validity.as_mut() { - let owned_validity = std::mem::take(validity); - let mut mut_bitmap = owned_validity.make_mut(); - f(&mut mut_bitmap); - assert_eq!(mut_bitmap.len(), self.values.len()); - *validity = mut_bitmap.into(); - } - } - /// Try to convert this [`BooleanArray`] to a [`MutableBooleanArray`] pub fn into_mut(self) -> Either { use Either::*; diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index 925b9a26fc8..f358b12dbab 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -1,7 +1,7 @@ use crate::{ bitmap::{ utils::{zip_validity, ZipValidity}, - Bitmap, MutableBitmap, + Bitmap, }, buffer::Buffer, datatypes::*, @@ -283,59 +283,24 @@ impl PrimitiveArray { self.values = values; } - /// Applies a function `f` to the values of this array, cloning the values - /// iff they are being shared with others - /// - /// This is an API to use clone-on-write - /// # Implementation - /// This function is `O(f)` if the data is not being shared, and `O(N) + O(f)` - /// if it is being shared (since it results in a `O(N)` memcopy). - /// # Panics - /// This function panics, if `f` modifies the length of `&mut [T]` - pub fn apply_values_mut(&mut self, f: F) { - let values = std::mem::take(&mut self.values); - let mut values = values.make_mut(); - let len = values.len(); - f(&mut values); - assert_eq!(values.len(), len, "values length must remain the same"); - self.values = values.into(); - } - - /// Applies a function `f` to the validity of this array, cloning it - /// iff it is being shared. - /// - /// This is an API to leverage clone-on-write - /// # Implementation - /// This function is `O(f)` if the data is not being shared, and `O(N) + O(f)` - /// if it is being shared (since it results in a `O(N)` memcopy). - /// # Panics - /// This function panics if the function modifies the length of the [`MutableBitmap`]. - pub fn apply_validity_mut(&mut self, f: F) { - if let Some(validity) = self.validity.as_mut() { - let owned_validity = std::mem::take(validity); - let mut mut_bitmap = owned_validity.make_mut(); - f(&mut mut_bitmap); - assert_eq!(mut_bitmap.len(), self.values.len()); - *validity = mut_bitmap.into(); - } - } - /// Applies a function `f` to the validity of this array, the caller can decide to make /// it mutable or not. /// /// This is an API to leverage clone-on-write - /// # Implementation - /// This function is `O(f)` if the data is not being shared, and `O(N) + O(f)` - /// if it is being shared (since it results in a `O(N)` memcopy). /// # Panics - /// This function panics if the function modifies the length of the [`MutableBitmap`]. - pub fn apply_validity(&mut self, f: F) { - if let Some(validity) = self.validity.as_mut() { - f(validity); + /// This function panics if the function `f` modifies the length of the [`Bitmap`]. + pub fn apply_validity Bitmap>(&mut self, f: F) { + if let Some(validity) = std::mem::take(&mut self.validity) { assert_eq!(validity.len(), self.values.len()); + self.validity = Some(f(validity)) } } + /// Returns an option of a mutable reference to the values of this [`PrimitiveArray`]. + pub fn get_mut_values(&mut self) -> Option<&mut [T]> { + self.values.get_mut().map(|x| x.as_mut()) + } + /// Try to convert this [`PrimitiveArray`] to a [`MutablePrimitiveArray`] via copy-on-write semantics. /// /// A [`PrimitiveArray`] is backed by a [`Buffer`] and [`Bitmap`] which are essentially `Arc>`. diff --git a/src/array/primitive/mutable.rs b/src/array/primitive/mutable.rs index ae983c10af0..573fa8953a3 100644 --- a/src/array/primitive/mutable.rs +++ b/src/array/primitive/mutable.rs @@ -90,27 +90,9 @@ impl MutablePrimitiveArray { /// This function is `O(f)` if the data is not being shared, and `O(N) + O(f)` /// if it is being shared (since it results in a `O(N)` memcopy). /// # Panics - /// This function panics, if `f` modifies the length of `&mut [T]` + /// This function panics iff `f` panics pub fn apply_values(&mut self, f: F) { - let len = self.values.len(); f(&mut self.values); - assert_eq!(len, self.values.len(), "values length must remain the same") - } - - /// Applies a function `f` to the validity of this array, cloning it - /// iff it is being shared. - /// - /// This is an API to leverage clone-on-write - /// # Implementation - /// This function is `O(f)` if the data is not being shared, and `O(N) + O(f)` - /// if it is being shared (since it results in a `O(N)` memcopy). - /// # Panics - /// This function panics if the function modifies the length of the [`MutableBitmap`]. - pub fn apply_validity(&mut self, f: F) { - if let Some(validity) = &mut self.validity { - f(validity); - assert_eq!(validity.len(), self.values.len()); - } } } diff --git a/src/buffer/immutable.rs b/src/buffer/immutable.rs index 614089a9052..c4e73ce33d9 100644 --- a/src/buffer/immutable.rs +++ b/src/buffer/immutable.rs @@ -137,6 +137,20 @@ impl Buffer { self.offset } + /// Gets a mutable reference to its underlying [`Vec`], if it not being shared. + /// + /// This operation returns a [`Vec`] iff this [`Buffer`]: + /// * is not an offsetted slice of another [`Buffer`] + /// * has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) + /// * has not been imported from the c data interface (FFI) + pub fn get_mut(&mut self) -> Option<&mut Vec> { + if self.offset != 0 { + None + } else { + Arc::get_mut(&mut self.data).and_then(|b| b.get_vec()) + } + } + /// Converts this [`Buffer`] to either a [`Buffer`] or a [`Vec`], returning itself if the conversion /// is not possible /// @@ -145,16 +159,10 @@ impl Buffer { /// * has not been cloned (i.e. [`Arc`]`::get_mut` yields [`Some`]) /// * has not been imported from the c data interface (FFI) pub fn into_mut(mut self) -> Either> { - if self.offset != 0 { - Either::Left(self) + if let Some(vec) = self.get_mut() { + Either::Right(std::mem::take(vec)) } else { - match Arc::get_mut(&mut self.data).and_then(|b| b.get_vec()) { - Some(v) => { - let data = std::mem::take(v); - Either::Right(data) - } - None => Either::Left(self), - } + Either::Left(self) } } diff --git a/src/compute/arity_assign.rs b/src/compute/arity_assign.rs index ff504c9690f..79d8f579b30 100644 --- a/src/compute/arity_assign.rs +++ b/src/compute/arity_assign.rs @@ -4,10 +4,12 @@ use super::utils::check_same_len; use crate::{array::PrimitiveArray, types::NativeType}; use either::Either; -/// Applies an unary function to a [`PrimitiveArray`] in-place via cow semantics. +/// Applies an unary function to a [`PrimitiveArray`], optionally in-place. /// /// # Implementation -/// This is the fastest method to apply a binary operation and it is often vectorized (SIMD). +/// This function tries to apply the function directly to the values of the array. +/// If that region is shared, this function creates a new region and writes to it. +/// /// # Panics /// This function panics iff /// * the arrays have a different length. @@ -18,14 +20,22 @@ where I: NativeType, F: Fn(I) -> I, { - array.apply_values_mut(|values| values.iter_mut().for_each(|v| *v = op(*v))); + if let Some(values) = array.get_mut_values() { + // mutate in place + values.iter_mut().for_each(|l| *l = op(*l)); + } else { + // alloc and write to new region + let values = array.values().iter().map(|l| op(*l)).collect::>(); + array.set_values(values.into()); + } } -/// Applies a binary operations to two [`PrimitiveArray`], applying the operation -/// in-place to the `lhs` via cow semantics. +/// Applies a binary function to two [`PrimitiveArray`]s, optionally in-place, returning +/// a new [`PrimitiveArray`]. /// /// # Implementation -/// This is the fastest way to perform a binary operation and it is often vectorized (SIMD). +/// This function tries to apply the function directly to the values of the array. +/// If that region is shared, this function creates a new region and writes to it. /// # Panics /// This function panics iff /// * the arrays have a different length. @@ -48,21 +58,16 @@ where // bitmap has an offset. if let Some(rhs) = rhs.validity() { if lhs.validity().is_none() { - *lhs = lhs.with_validity(Some(rhs.clone())) + lhs.set_validity(Some(rhs.clone())); } else { lhs.apply_validity(|bitmap| { - // we need to take ownership for the `into_mut` call, but leave the `&mut` lhs intact - // so that we can later assign the result to out `&mut bitmap` - let owned_lhs = std::mem::take(bitmap); - - *bitmap = match owned_lhs.into_mut() { - // we take alloc and write to new buffer + match bitmap.into_mut() { Either::Left(immutable) => { - // we allocate a new bitmap + // alloc new region &immutable & rhs } - // we can mutate in place, happy days. Either::Right(mut mutable) => { + // mutate in place let mut mutable_ref = &mut mutable; mutable_ref &= rhs; mutable.into() @@ -70,32 +75,22 @@ where } }); } - } + }; - // we need to take ownership for the `into_mut` call, but leave the `&mut` lhs intact - // so that we can later assign the result to out `&mut lhs` - let owned_lhs = std::mem::take(lhs); - - *lhs = match owned_lhs.into_mut() { - // we take alloc and write to new buffer - Either::Left(mut immutable) => { - let values = immutable - .values() - .iter() - .zip(rhs.values().iter()) - .map(|(l, r)| op(*l, *r)) - .collect::>(); - immutable.set_values(values.into()); - immutable - } - // we can mutate in place - Either::Right(mut mutable) => { - mutable.apply_values(|x| { - x.iter_mut() - .zip(rhs.values().iter()) - .for_each(|(l, r)| *l = op(*l, *r)) - }); - mutable.into() - } + if let Some(values) = lhs.get_mut_values() { + // mutate values in place + values + .iter_mut() + .zip(rhs.values().iter()) + .for_each(|(l, r)| *l = op(*l, *r)); + } else { + // alloc new region + let values = lhs + .values() + .iter() + .zip(rhs.values().iter()) + .map(|(l, r)| op(*l, *r)) + .collect::>(); + lhs.set_values(values.into()); } } diff --git a/tests/it/array/boolean/mod.rs b/tests/it/array/boolean/mod.rs index ffd5867bbb1..2dd47d480d9 100644 --- a/tests/it/array/boolean/mod.rs +++ b/tests/it/array/boolean/mod.rs @@ -131,27 +131,3 @@ fn from_iter() { let a: BooleanArray = iter.collect(); assert_eq!(a.len(), 2); } - -#[test] -fn apply_values() { - let mut a = BooleanArray::from([Some(true), Some(false), None]); - a.apply_values_mut(|x| { - let mut a = std::mem::take(x); - a = !a; - *x = a; - }); - let expected = BooleanArray::from([Some(false), Some(true), None]); - assert_eq!(a, expected); -} - -#[test] -fn apply_validity() { - let mut a = BooleanArray::from([Some(true), Some(false), None]); - a.apply_validity_mut(|x| { - let mut a = std::mem::take(x); - a = !a; - *x = a; - }); - let expected = BooleanArray::from([None, None, Some(false)]); - assert_eq!(a, expected); -} diff --git a/tests/it/array/primitive/mod.rs b/tests/it/array/primitive/mod.rs index bb0402f4006..058d503f479 100644 --- a/tests/it/array/primitive/mod.rs +++ b/tests/it/array/primitive/mod.rs @@ -124,25 +124,3 @@ fn into_mut_3() { let array = PrimitiveArray::new(DataType::Int32, values, validity); assert!(array.into_mut().is_right()); } - -#[test] -fn apply_values() { - let mut a = PrimitiveArray::from([Some(1), Some(2), None]); - a.apply_values_mut(|x| { - x[0] = 10; - }); - let expected = PrimitiveArray::from([Some(10), Some(2), None]); - assert_eq!(a, expected); -} - -#[test] -fn apply_validity() { - let mut a = PrimitiveArray::from([Some(1), Some(2), None]); - a.apply_validity_mut(|x| { - let mut a = std::mem::take(x); - a = !a; - *x = a; - }); - let expected = PrimitiveArray::from([None, None, Some(0)]); - assert_eq!(a, expected); -}