diff --git a/benches/write_parquet.rs b/benches/write_parquet.rs index bfa0d6872b4..f4053c92791 100644 --- a/benches/write_parquet.rs +++ b/benches/write_parquet.rs @@ -11,7 +11,7 @@ type ChunkBox = Chunk>; fn write(array: &dyn Array, encoding: Encoding) -> Result<()> { let schema = Schema::from(vec![Field::new("c1", array.data_type().clone(), true)]); - let columns: ChunkBox = Chunk::new(vec![clone(array).into()]); + let columns: ChunkBox = Chunk::new(vec![clone(array)]); let options = WriteOptions { write_statistics: false, diff --git a/examples/cow.rs b/examples/cow.rs new file mode 100644 index 00000000000..c3ae476f91f --- /dev/null +++ b/examples/cow.rs @@ -0,0 +1,42 @@ +// This example demos how to operate on arrays in-place. +use arrow2::{ + array::{Array, PrimitiveArray}, + types::NativeType, +}; + +// this function will clone-on-write the array and apply `f` to its values +fn cow_apply(array: &mut Box, f: F) { + // 1. downcast the array to its concrete type (the `unwrap` below panics if the downcast fails) + let array = array + .as_any_mut() + .downcast_mut::>() + .unwrap(); + + // 2. empty the mut reference and create a new array on the stack with its contents + let new_array = array.take(); + + // 3. deconstruct the array into its parts + let (dt, values, validity) = new_array.into_inner(); + + // 4. clone-on-write the values + let mut values = values.make_mut(); + + // 5. apply the function over the values + f(&mut values); + + // 6. 
assign the new values to the array + array.try_assign(dt, values.into(), validity).unwrap(); +} + +fn main() { + // say we have received an array + let mut array = PrimitiveArray::from_vec(vec![1i32, 2]).boxed(); + + // we can apply a transformation to its values without allocating a new array as follows: + cow_apply(&mut array, |values: &mut [i32]| { + values.iter_mut().for_each(|x| *x *= 10) + }); + + // confirm that it gives the right result :) + assert_eq!(array.as_ref(), PrimitiveArray::from_vec(vec![10i32, 20])); +} diff --git a/guide/src/high_level.md b/guide/src/high_level.md index 8c5b93c55f0..41684c3c432 100644 --- a/guide/src/high_level.md +++ b/guide/src/high_level.md @@ -268,3 +268,16 @@ Some notes: and cloned its validity. This approach is suitable for operations whose branching off is more expensive than operating over all values. If the operation is expensive, then using `PrimitiveArray::::from_trusted_len_iter` is likely faster. + +## Clone on write semantics + +We support the mutation of arrays in-place via clone-on-write semantics. +Essentially, all data is under an `Arc`, but it can be taken via `Arc::get_mut` +and operated in place. + +Below is a complete example of how to operate on a `Box` without +extra allocations. 
+ +```rust +{{#include ../../examples/cow.rs}} +``` diff --git a/src/array/primitive/mod.rs b/src/array/primitive/mod.rs index 9db52463c52..b2c787b1c53 100644 --- a/src/array/primitive/mod.rs +++ b/src/array/primitive/mod.rs @@ -53,6 +53,28 @@ pub struct PrimitiveArray { validity: Option, } +fn check( + data_type: &DataType, + values: &[T], + validity: &Option, +) -> Result<(), Error> { + if validity + .as_ref() + .map_or(false, |validity| validity.len() != values.len()) + { + return Err(Error::oos( + "validity mask length must match the number of values", + )); + } + + if data_type.to_physical_type() != PhysicalType::Primitive(T::PRIMITIVE) { + return Err(Error::oos( + "BooleanArray can only be initialized with a DataType whose physical type is Primitive", + )); + } + Ok(()) +} + impl PrimitiveArray { /// The canonical method to create a [`PrimitiveArray`] out of its internal components. /// # Implementation @@ -67,21 +89,7 @@ impl PrimitiveArray { values: Buffer, validity: Option, ) -> Result { - if validity - .as_ref() - .map_or(false, |validity| validity.len() != values.len()) - { - return Err(Error::oos( - "validity mask length must match the number of values", - )); - } - - if data_type.to_physical_type() != PhysicalType::Primitive(T::PRIMITIVE) { - return Err(Error::oos( - "BooleanArray can only be initialized with a DataType whose physical type is Primitive", - )); - } - + check(&data_type, &values, &validity)?; Ok(Self { data_type, values, @@ -109,14 +117,7 @@ impl PrimitiveArray { #[inline] #[must_use] pub fn to(self, data_type: DataType) -> Self { - if !data_type.to_physical_type().eq_primitive(T::PRIMITIVE) { - Err(Error::InvalidArgumentError(format!( - "Type {} does not support logical type {:?}", - std::any::type_name::(), - data_type - ))) - .unwrap() - } + check(&data_type, &self.values, &self.validity).unwrap(); Self { data_type, values: self.values, @@ -252,15 +253,37 @@ impl PrimitiveArray { arr } - /// Returns a new [`PrimitiveArray`] by taking 
every buffer from this one, leaving this one empty. + /// Returns a new [`PrimitiveArray`] by taking everything from this one. + #[must_use] pub fn take(&mut self) -> Self { + let mut data_type: DataType = T::PRIMITIVE.into(); + std::mem::swap(&mut self.data_type, &mut data_type); Self { - data_type: self.data_type.clone(), + data_type, values: std::mem::take(&mut self.values), validity: std::mem::take(&mut self.validity), } } + /// Tries to assign the arguments to itself. + /// + /// This function is semantically similar to [`Self::try_new`] but it can be used to populate an existing + /// Array. + /// # Errors + /// Errors if the `data_type`'s [`PhysicalType`] is not equal to [`PhysicalType::Primitive(T::PRIMITIVE)`] or if `validity`'s length differs from the length of `values`; on error `self` is left unmodified. + pub fn try_assign( + &mut self, + data_type: DataType, + values: Buffer, + validity: Option, + ) -> Result<(), Error> { + check(&data_type, &values, &validity)?; + self.data_type = data_type; + self.values = values; + self.validity = validity; + Ok(()) + } + /// Deconstructs this [`PrimitiveArray`] into its internal components pub fn into_inner(self) -> (DataType, Buffer, Option) { let Self {