diff --git a/Cargo.toml b/Cargo.toml index 07fe03ca132..1359736e7e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -181,8 +181,7 @@ compute_substring = [] compute_take = [] compute_temporal = [] compute_window = ["compute_concatenate"] -compute_lower = [] -compute_upper = [] +compute_utf8 = [] compute = [ "compute_aggregate", "compute_arithmetics", @@ -207,9 +206,8 @@ compute = [ "compute_substring", "compute_take", "compute_temporal", - "compute_window", - "compute_lower", - "compute_upper" + "compute_utf8", + "compute_window" ] benchmarks = ["rand"] simd = ["packed_simd"] diff --git a/src/compute/lower.rs b/src/compute/lower.rs deleted file mode 100644 index 5a9978179b4..00000000000 --- a/src/compute/lower.rs +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines kernel to extract a lower case of a \[Large\]StringArray - -use super::utils::utf8_apply; -use crate::array::*; -use crate::{ - datatypes::DataType, - error::{ArrowError, Result}, -}; - -/// Returns a new `Array` where each of each of the elements is lower-cased. -/// this function errors when the passed array is not a \[Large\]String array. 
-pub fn lower(array: &dyn Array) -> Result> { - match array.data_type() { - DataType::LargeUtf8 => Ok(Box::new(utf8_apply( - str::to_lowercase, - array - .as_any() - .downcast_ref::>() - .expect("A large string is expected"), - ))), - DataType::Utf8 => Ok(Box::new(utf8_apply( - str::to_lowercase, - array - .as_any() - .downcast_ref::>() - .expect("A string is expected"), - ))), - _ => Err(ArrowError::InvalidArgumentError(format!( - "lower does not support type {:?}", - array.data_type() - ))), - } -} - -/// Checks if an array of type `datatype` can perform lower operation -/// -/// # Examples -/// ``` -/// use arrow2::compute::lower::can_lower; -/// use arrow2::datatypes::{DataType}; -/// -/// let data_type = DataType::Utf8; -/// assert_eq!(can_lower(&data_type), true); -/// -/// let data_type = DataType::Null; -/// assert_eq!(can_lower(&data_type), false); -/// ``` -pub fn can_lower(data_type: &DataType) -> bool { - matches!(data_type, DataType::LargeUtf8 | DataType::Utf8) -} diff --git a/src/compute/mod.rs b/src/compute/mod.rs index 7667b320e53..801abd2bff1 100644 --- a/src/compute/mod.rs +++ b/src/compute/mod.rs @@ -57,9 +57,6 @@ pub mod like; #[cfg(feature = "compute_limit")] #[cfg_attr(docsrs, doc(cfg(feature = "compute_limit")))] pub mod limit; -#[cfg(feature = "compute_lower")] -#[cfg_attr(docsrs, doc(cfg(feature = "compute_lower")))] -pub mod lower; #[cfg(feature = "compute_merge_sort")] #[cfg_attr(docsrs, doc(cfg(feature = "compute_merge_sort")))] pub mod merge_sort; @@ -84,9 +81,9 @@ pub mod take; #[cfg(feature = "compute_temporal")] #[cfg_attr(docsrs, doc(cfg(feature = "compute_temporal")))] pub mod temporal; -#[cfg(feature = "compute_upper")] -#[cfg_attr(docsrs, doc(cfg(feature = "compute_upper")))] -pub mod upper; +#[cfg(feature = "compute_utf8")] +#[cfg_attr(docsrs, doc(cfg(feature = "compute_utf8")))] +pub mod utf8; mod utils; #[cfg(feature = "compute_window")] #[cfg_attr(docsrs, doc(cfg(feature = "compute_window")))] diff --git 
a/src/compute/regex_match.rs b/src/compute/regex_match.rs index 3cdc94cfad1..5866a4310d9 100644 --- a/src/compute/regex_match.rs +++ b/src/compute/regex_match.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use regex::Regex; -use super::utils::{combine_validities, unary_utf8_boolean}; +use super::utils::combine_validities; use crate::array::{BooleanArray, Offset, Utf8Array}; use crate::bitmap::Bitmap; use crate::datatypes::DataType; @@ -69,3 +69,19 @@ pub fn regex_match_scalar(values: &Utf8Array, regex: &str) -> Resu .map_err(|e| ArrowError::InvalidArgumentError(format!("Unable to compile regex: {}", e)))?; Ok(unary_utf8_boolean(values, |x| regex.is_match(x))) } + +fn unary_utf8_boolean bool>( + values: &Utf8Array, + op: F, +) -> BooleanArray { + let validity = values.validity().cloned(); + + let iterator = values.iter().map(|value| { + if value.is_none() { + return false; + }; + op(value.unwrap()) + }); + let values = Bitmap::from_trusted_len_iter(iterator); + BooleanArray::from_data(DataType::Boolean, values, validity) +} diff --git a/src/compute/upper.rs b/src/compute/upper.rs deleted file mode 100644 index 6afbdd09286..00000000000 --- a/src/compute/upper.rs +++ /dev/null @@ -1,67 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Defines kernel to extract a upper case of a \[Large\]StringArray - -use super::utils::utf8_apply; -use crate::array::*; -use crate::{ - datatypes::DataType, - error::{ArrowError, Result}, -}; - -/// Returns a new `Array` where each of each of the elements is upper-cased. -/// this function errors when the passed array is not a \[Large\]String array. -pub fn upper(array: &dyn Array) -> Result> { - match array.data_type() { - DataType::LargeUtf8 => Ok(Box::new(utf8_apply( - str::to_uppercase, - array - .as_any() - .downcast_ref::>() - .expect("A large string is expected"), - ))), - DataType::Utf8 => Ok(Box::new(utf8_apply( - str::to_uppercase, - array - .as_any() - .downcast_ref::>() - .expect("A string is expected"), - ))), - _ => Err(ArrowError::InvalidArgumentError(format!( - "upper does not support type {:?}", - array.data_type() - ))), - } -} - -/// Checks if an array of type `datatype` can perform upper operation -/// -/// # Examples -/// ``` -/// use arrow2::compute::upper::can_upper; -/// use arrow2::datatypes::{DataType}; -/// -/// let data_type = DataType::Utf8; -/// assert_eq!(can_upper(&data_type), true); -/// -/// let data_type = DataType::Null; -/// assert_eq!(can_upper(&data_type), false); -/// ``` -pub fn can_upper(data_type: &DataType) -> bool { - matches!(data_type, DataType::LargeUtf8 | DataType::Utf8) -} diff --git a/src/compute/utf8.rs b/src/compute/utf8.rs new file mode 100644 index 00000000000..1b252e47505 --- /dev/null +++ b/src/compute/utf8.rs @@ -0,0 +1,99 @@ +//! Defines common maps to a [`Utf8Array`] + +use crate::{ + array::{Array, Offset, Utf8Array}, + datatypes::DataType, + error::{ArrowError, Result}, +}; + +/// utf8_apply will apply `Fn(&str) -> String` to every value in Utf8Array. 
+pub fn utf8_apply String>(f: F, array: &Utf8Array) -> Utf8Array { + let iter = array.values_iter().map(f); + + let new = Utf8Array::::from_trusted_len_values_iter(iter); + new.with_validity(array.validity().cloned()) +} + +/// Returns a new `Array` where each of the elements is upper-cased. +/// This function errors when the passed array is not a \[Large\]String array. +pub fn upper(array: &dyn Array) -> Result> { + match array.data_type() { + DataType::LargeUtf8 => Ok(Box::new(utf8_apply( + str::to_uppercase, + array + .as_any() + .downcast_ref::>() + .expect("A large string is expected"), + ))), + DataType::Utf8 => Ok(Box::new(utf8_apply( + str::to_uppercase, + array + .as_any() + .downcast_ref::>() + .expect("A string is expected"), + ))), + _ => Err(ArrowError::InvalidArgumentError(format!( + "upper does not support type {:?}", + array.data_type() + ))), + } +} + +/// Checks if an array of type `datatype` can perform upper operation +/// +/// # Examples +/// ``` +/// use arrow2::compute::utf8::can_upper; +/// use arrow2::datatypes::{DataType}; +/// +/// let data_type = DataType::Utf8; +/// assert_eq!(can_upper(&data_type), true); +/// +/// let data_type = DataType::Null; +/// assert_eq!(can_upper(&data_type), false); +/// ``` +pub fn can_upper(data_type: &DataType) -> bool { + matches!(data_type, DataType::LargeUtf8 | DataType::Utf8) +} + +/// Returns a new `Array` where each of the elements is lower-cased. +/// This function errors when the passed array is not a \[Large\]String array. 
+pub fn lower(array: &dyn Array) -> Result> { + match array.data_type() { + DataType::LargeUtf8 => Ok(Box::new(utf8_apply( + str::to_lowercase, + array + .as_any() + .downcast_ref::>() + .expect("A large string is expected"), + ))), + DataType::Utf8 => Ok(Box::new(utf8_apply( + str::to_lowercase, + array + .as_any() + .downcast_ref::>() + .expect("A string is expected"), + ))), + _ => Err(ArrowError::InvalidArgumentError(format!( + "lower does not support type {:?}", + array.data_type() + ))), + } +} + +/// Checks if an array of type `datatype` can perform lower operation +/// +/// # Examples +/// ``` +/// use arrow2::compute::utf8::can_lower; +/// use arrow2::datatypes::{DataType}; +/// +/// let data_type = DataType::Utf8; +/// assert_eq!(can_lower(&data_type), true); +/// +/// let data_type = DataType::Null; +/// assert_eq!(can_lower(&data_type), false); +/// ``` +pub fn can_lower(data_type: &DataType) -> bool { + matches!(data_type, DataType::LargeUtf8 | DataType::Utf8) +} diff --git a/src/compute/utils.rs b/src/compute/utils.rs index 269a9c9ce37..ea9e44d2896 100644 --- a/src/compute/utils.rs +++ b/src/compute/utils.rs @@ -1,7 +1,6 @@ use crate::{ - array::{Array, BooleanArray, Offset, Utf8Array}, + array::Array, bitmap::Bitmap, - datatypes::DataType, error::{ArrowError, Result}, }; @@ -14,30 +13,6 @@ pub fn combine_validities(lhs: Option<&Bitmap>, rhs: Option<&Bitmap>) -> Option< } } -pub fn unary_utf8_boolean bool>( - values: &Utf8Array, - op: F, -) -> BooleanArray { - let validity = values.validity().cloned(); - - let iterator = values.iter().map(|value| { - if value.is_none() { - return false; - }; - op(value.unwrap()) - }); - let values = Bitmap::from_trusted_len_iter(iterator); - BooleanArray::from_data(DataType::Boolean, values, validity) -} - -/// utf8_apply will apply `Fn(&str) -> String` to every value in Utf8Array. 
-pub fn utf8_apply String>(f: F, array: &Utf8Array) -> Utf8Array { - let iter = array.values_iter().map(f); - - let new = Utf8Array::::from_trusted_len_values_iter(iter); - new.with_validity(array.validity().cloned()) -} - // Errors iff the two arrays have a different length. #[inline] pub fn check_same_len(lhs: &dyn Array, rhs: &dyn Array) -> Result<()> { diff --git a/src/error.rs b/src/error.rs index aca64d3d659..22faa164c35 100644 --- a/src/error.rs +++ b/src/error.rs @@ -34,6 +34,7 @@ impl ArrowError { Self::OutOfSpec(msg.into()) } + #[allow(dead_code)] pub(crate) fn nyi>(msg: A) -> Self { Self::NotYetImplemented(msg.into()) } diff --git a/tests/it/compute/lower.rs b/tests/it/compute/lower.rs deleted file mode 100644 index d8f594174fb..00000000000 --- a/tests/it/compute/lower.rs +++ /dev/null @@ -1,186 +0,0 @@ -use arrow2::{array::*, compute::lower::*, error::Result}; - -fn with_nulls_utf8() -> Result<()> { - let cases = vec![ - // identity - ( - vec![Some("hello"), None, Some("world")], - vec![Some("hello"), None, Some("world")], - ), - // part of input - ( - vec![Some("Hello"), None, Some("wOrld")], - vec![Some("hello"), None, Some("world")], - ), - // all input - ( - vec![Some("HELLO"), None, Some("WORLD")], - vec![Some("hello"), None, Some("world")], - ), - // UTF8 characters - ( - vec![ - None, - Some("السلام عليكم"), - Some("Dobrý den"), - Some("שָׁלוֹם"), - Some("नमस्ते"), - Some("こんにちは"), - Some("안녕하세요"), - Some("你好"), - Some("Olá"), - Some("Здравствуйте"), - Some("Hola"), - ], - vec![ - None, - Some("السلام عليكم"), - Some("dobrý den"), - Some("שָׁלוֹם"), - Some("नमस्ते"), - Some("こんにちは"), - Some("안녕하세요"), - Some("你好"), - Some("olá"), - Some("здравствуйте"), - Some("hola"), - ], - ), - ]; - - cases - .into_iter() - .try_for_each::<_, Result<()>>(|(array, expected)| { - let array = Utf8Array::::from(&array); - let result = lower(&array)?; - assert_eq!(array.len(), result.len()); - - let result = result.as_any().downcast_ref::>().unwrap(); - let expected 
= Utf8Array::::from(&expected); - - assert_eq!(&expected, result); - Ok(()) - })?; - - Ok(()) -} - -#[test] -fn with_nulls_string() -> Result<()> { - with_nulls_utf8::() -} - -#[test] -fn with_nulls_large_string() -> Result<()> { - with_nulls_utf8::() -} - -fn without_nulls_utf8() -> Result<()> { - let cases = vec![ - // identity - (vec!["hello", "world"], vec!["hello", "world"]), - // part of input - (vec!["Hello", "wOrld"], vec!["hello", "world"]), - // all input - (vec!["HELLO", "WORLD"], vec!["hello", "world"]), - // UTF8 characters - ( - vec![ - "السلام عليكم", - "Dobrý den", - "שָׁלוֹם", - "नमस्ते", - "こんにちは", - "안녕하세요", - "你好", - "Olá", - "Здравствуйте", - "Hola", - ], - vec![ - "السلام عليكم", - "dobrý den", - "שָׁלוֹם", - "नमस्ते", - "こんにちは", - "안녕하세요", - "你好", - "olá", - "здравствуйте", - "hola", - ], - ), - ]; - - cases - .into_iter() - .try_for_each::<_, Result<()>>(|(array, expected)| { - let array = Utf8Array::::from_slice(&array); - let result = lower(&array)?; - assert_eq!(array.len(), result.len()); - - let result = result.as_any().downcast_ref::>().unwrap(); - let expected = Utf8Array::::from_slice(&expected); - assert_eq!(&expected, result); - Ok(()) - })?; - - Ok(()) -} - -#[test] -fn without_nulls_string() -> Result<()> { - without_nulls_utf8::() -} - -#[test] -fn without_nulls_large_string() -> Result<()> { - without_nulls_utf8::() -} - -#[test] -fn consistency() { - use arrow2::datatypes::DataType::*; - use arrow2::datatypes::TimeUnit; - let datatypes = vec![ - Null, - Boolean, - UInt8, - UInt16, - UInt32, - UInt64, - Int8, - Int16, - Int32, - Int64, - Float32, - Float64, - Timestamp(TimeUnit::Second, None), - Timestamp(TimeUnit::Millisecond, None), - Timestamp(TimeUnit::Microsecond, None), - Timestamp(TimeUnit::Nanosecond, None), - Time64(TimeUnit::Microsecond), - Time64(TimeUnit::Nanosecond), - Date32, - Time32(TimeUnit::Second), - Time32(TimeUnit::Millisecond), - Date64, - Utf8, - LargeUtf8, - Binary, - LargeBinary, - 
Duration(TimeUnit::Second), - Duration(TimeUnit::Millisecond), - Duration(TimeUnit::Microsecond), - Duration(TimeUnit::Nanosecond), - ]; - - datatypes.into_iter().for_each(|d1| { - let array = new_null_array(d1.clone(), 10); - if can_lower(&d1) { - assert!(lower(array.as_ref()).is_ok()); - } else { - assert!(lower(array.as_ref()).is_err()); - } - }); -} diff --git a/tests/it/compute/mod.rs b/tests/it/compute/mod.rs index 9b5079be712..f4972c21940 100644 --- a/tests/it/compute/mod.rs +++ b/tests/it/compute/mod.rs @@ -28,8 +28,6 @@ mod length; mod like; #[cfg(feature = "compute_limit")] mod limit; -#[cfg(feature = "compute_lower")] -mod lower; #[cfg(feature = "compute_merge_sort")] mod merge_sort; #[cfg(feature = "compute_partition")] @@ -44,7 +42,7 @@ mod substring; mod take; #[cfg(feature = "compute_temporal")] mod temporal; -#[cfg(feature = "compute_upper")] -mod upper; +#[cfg(feature = "compute_utf8")] +mod utf8; #[cfg(feature = "compute_window")] mod window; diff --git a/tests/it/compute/upper.rs b/tests/it/compute/upper.rs deleted file mode 100644 index 4daf35626a9..00000000000 --- a/tests/it/compute/upper.rs +++ /dev/null @@ -1,186 +0,0 @@ -use arrow2::{array::*, compute::upper::*, error::Result}; - -fn with_nulls_utf8() -> Result<()> { - let cases = vec![ - // identity - ( - vec![Some("hello"), None, Some("world")], - vec![Some("HELLO"), None, Some("WORLD")], - ), - // part of input - ( - vec![Some("Hello"), None, Some("wOrld")], - vec![Some("HELLO"), None, Some("WORLD")], - ), - // all input - ( - vec![Some("hello"), None, Some("world")], - vec![Some("HELLO"), None, Some("WORLD")], - ), - // UTF8 characters - ( - vec![ - None, - Some("السلام عليكم"), - Some("Dobrý den"), - Some("שָׁלוֹם"), - Some("नमस्ते"), - Some("こんにちは"), - Some("안녕하세요"), - Some("你好"), - Some("Olá"), - Some("Здравствуйте"), - Some("Hola"), - ], - vec![ - None, - Some("السلام عليكم"), - Some("DOBRÝ DEN"), - Some("שָׁלוֹם"), - Some("नमस्ते"), - Some("こんにちは"), - Some("안녕하세요"), - Some("你好"), - 
Some("OLÁ"), - Some("ЗДРАВСТВУЙТЕ"), - Some("HOLA"), - ], - ), - ]; - - cases - .into_iter() - .try_for_each::<_, Result<()>>(|(array, expected)| { - let array = Utf8Array::::from(&array); - let result = upper(&array)?; - assert_eq!(array.len(), result.len()); - - let result = result.as_any().downcast_ref::>().unwrap(); - let expected = Utf8Array::::from(&expected); - - assert_eq!(&expected, result); - Ok(()) - })?; - - Ok(()) -} - -#[test] -fn with_nulls_string() -> Result<()> { - with_nulls_utf8::() -} - -#[test] -fn with_nulls_large_string() -> Result<()> { - with_nulls_utf8::() -} - -fn without_nulls_utf8() -> Result<()> { - let cases = vec![ - // identity - (vec!["hello", "world"], vec!["HELLO", "WORLD"]), - // part of input - (vec!["Hello", "wOrld"], vec!["HELLO", "WORLD"]), - // all input - (vec!["HELLO", "WORLD"], vec!["HELLO", "WORLD"]), - // UTF8 characters - ( - vec![ - "السلام عليكم", - "Dobrý den", - "שָׁלוֹם", - "नमस्ते", - "こんにちは", - "안녕하세요", - "你好", - "Olá", - "Здравствуйте", - "Hola", - ], - vec![ - "السلام عليكم", - "DOBRÝ DEN", - "שָׁלוֹם", - "नमस्ते", - "こんにちは", - "안녕하세요", - "你好", - "OLÁ", - "ЗДРАВСТВУЙТЕ", - "HOLA", - ], - ), - ]; - - cases - .into_iter() - .try_for_each::<_, Result<()>>(|(array, expected)| { - let array = Utf8Array::::from_slice(&array); - let result = upper(&array)?; - assert_eq!(array.len(), result.len()); - - let result = result.as_any().downcast_ref::>().unwrap(); - let expected = Utf8Array::::from_slice(&expected); - assert_eq!(&expected, result); - Ok(()) - })?; - - Ok(()) -} - -#[test] -fn without_nulls_string() -> Result<()> { - without_nulls_utf8::() -} - -#[test] -fn without_nulls_large_string() -> Result<()> { - without_nulls_utf8::() -} - -#[test] -fn consistency() { - use arrow2::datatypes::DataType::*; - use arrow2::datatypes::TimeUnit; - let datatypes = vec![ - Null, - Boolean, - UInt8, - UInt16, - UInt32, - UInt64, - Int8, - Int16, - Int32, - Int64, - Float32, - Float64, - Timestamp(TimeUnit::Second, None), - 
Timestamp(TimeUnit::Millisecond, None), - Timestamp(TimeUnit::Microsecond, None), - Timestamp(TimeUnit::Nanosecond, None), - Time64(TimeUnit::Microsecond), - Time64(TimeUnit::Nanosecond), - Date32, - Time32(TimeUnit::Second), - Time32(TimeUnit::Millisecond), - Date64, - Utf8, - LargeUtf8, - Binary, - LargeBinary, - Duration(TimeUnit::Second), - Duration(TimeUnit::Millisecond), - Duration(TimeUnit::Microsecond), - Duration(TimeUnit::Nanosecond), - ]; - - datatypes.into_iter().for_each(|d1| { - let array = new_null_array(d1.clone(), 10); - if can_upper(&d1) { - assert!(upper(array.as_ref()).is_ok()); - } else { - assert!(upper(array.as_ref()).is_err()); - } - }); -} diff --git a/tests/it/compute/utf8.rs b/tests/it/compute/utf8.rs new file mode 100644 index 00000000000..864dc0eca27 --- /dev/null +++ b/tests/it/compute/utf8.rs @@ -0,0 +1,371 @@ +use arrow2::{array::*, compute::utf8::*, error::Result}; + +fn with_nulls_utf8_lower() -> Result<()> { + let cases = vec![ + // identity + ( + vec![Some("hello"), None, Some("world")], + vec![Some("hello"), None, Some("world")], + ), + // part of input + ( + vec![Some("Hello"), None, Some("wOrld")], + vec![Some("hello"), None, Some("world")], + ), + // all input + ( + vec![Some("HELLO"), None, Some("WORLD")], + vec![Some("hello"), None, Some("world")], + ), + // UTF8 characters + ( + vec![ + None, + Some("السلام عليكم"), + Some("Dobrý den"), + Some("שָׁלוֹם"), + Some("नमस्ते"), + Some("こんにちは"), + Some("안녕하세요"), + Some("你好"), + Some("Olá"), + Some("Здравствуйте"), + Some("Hola"), + ], + vec![ + None, + Some("السلام عليكم"), + Some("dobrý den"), + Some("שָׁלוֹם"), + Some("नमस्ते"), + Some("こんにちは"), + Some("안녕하세요"), + Some("你好"), + Some("olá"), + Some("здравствуйте"), + Some("hola"), + ], + ), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, expected)| { + let array = Utf8Array::::from(&array); + let result = lower(&array)?; + assert_eq!(array.len(), result.len()); + + let result = 
result.as_any().downcast_ref::>().unwrap(); + let expected = Utf8Array::::from(&expected); + + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) +} + +#[test] +fn test_lower() -> Result<()> { + with_nulls_utf8_lower::() +} + +#[test] +fn test_large_lower() -> Result<()> { + with_nulls_utf8_lower::() +} + +fn without_nulls_utf8_lower() -> Result<()> { + let cases = vec![ + // identity + (vec!["hello", "world"], vec!["hello", "world"]), + // part of input + (vec!["Hello", "wOrld"], vec!["hello", "world"]), + // all input + (vec!["HELLO", "WORLD"], vec!["hello", "world"]), + // UTF8 characters + ( + vec![ + "السلام عليكم", + "Dobrý den", + "שָׁלוֹם", + "नमस्ते", + "こんにちは", + "안녕하세요", + "你好", + "Olá", + "Здравствуйте", + "Hola", + ], + vec![ + "السلام عليكم", + "dobrý den", + "שָׁלוֹם", + "नमस्ते", + "こんにちは", + "안녕하세요", + "你好", + "olá", + "здравствуйте", + "hola", + ], + ), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, expected)| { + let array = Utf8Array::::from_slice(&array); + let result = lower(&array)?; + assert_eq!(array.len(), result.len()); + + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = Utf8Array::::from_slice(&expected); + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) +} + +#[test] +fn without_nulls_string_lower() -> Result<()> { + without_nulls_utf8_lower::() +} + +#[test] +fn without_nulls_large_string_lower() -> Result<()> { + without_nulls_utf8_lower::() +} + +#[test] +fn consistency_lower() { + use arrow2::datatypes::DataType::*; + use arrow2::datatypes::TimeUnit; + let datatypes = vec![ + Null, + Boolean, + UInt8, + UInt16, + UInt32, + UInt64, + Int8, + Int16, + Int32, + Int64, + Float32, + Float64, + Timestamp(TimeUnit::Second, None), + Timestamp(TimeUnit::Millisecond, None), + Timestamp(TimeUnit::Microsecond, None), + Timestamp(TimeUnit::Nanosecond, None), + Time64(TimeUnit::Microsecond), + Time64(TimeUnit::Nanosecond), + Date32, + Time32(TimeUnit::Second), + 
Time32(TimeUnit::Millisecond), + Date64, + Utf8, + LargeUtf8, + Binary, + LargeBinary, + Duration(TimeUnit::Second), + Duration(TimeUnit::Millisecond), + Duration(TimeUnit::Microsecond), + Duration(TimeUnit::Nanosecond), + ]; + + datatypes.into_iter().for_each(|d1| { + let array = new_null_array(d1.clone(), 10); + if can_lower(&d1) { + assert!(lower(array.as_ref()).is_ok()); + } else { + assert!(lower(array.as_ref()).is_err()); + } + }); +} + +fn with_nulls_utf8() -> Result<()> { + let cases = vec![ + // identity + ( + vec![Some("hello"), None, Some("world")], + vec![Some("HELLO"), None, Some("WORLD")], + ), + // part of input + ( + vec![Some("Hello"), None, Some("wOrld")], + vec![Some("HELLO"), None, Some("WORLD")], + ), + // all input + ( + vec![Some("hello"), None, Some("world")], + vec![Some("HELLO"), None, Some("WORLD")], + ), + // UTF8 characters + ( + vec![ + None, + Some("السلام عليكم"), + Some("Dobrý den"), + Some("שָׁלוֹם"), + Some("नमस्ते"), + Some("こんにちは"), + Some("안녕하세요"), + Some("你好"), + Some("Olá"), + Some("Здравствуйте"), + Some("Hola"), + ], + vec![ + None, + Some("السلام عليكم"), + Some("DOBRÝ DEN"), + Some("שָׁלוֹם"), + Some("नमस्ते"), + Some("こんにちは"), + Some("안녕하세요"), + Some("你好"), + Some("OLÁ"), + Some("ЗДРАВСТВУЙТЕ"), + Some("HOLA"), + ], + ), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, expected)| { + let array = Utf8Array::::from(&array); + let result = upper(&array)?; + assert_eq!(array.len(), result.len()); + + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = Utf8Array::::from(&expected); + + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) +} + +#[test] +fn with_nulls_string() -> Result<()> { + with_nulls_utf8::() +} + +#[test] +fn with_nulls_large_string() -> Result<()> { + with_nulls_utf8::() +} + +fn without_nulls_utf8() -> Result<()> { + let cases = vec![ + // identity + (vec!["hello", "world"], vec!["HELLO", "WORLD"]), + // part of input + (vec!["Hello", "wOrld"], 
vec!["HELLO", "WORLD"]), + // all input + (vec!["HELLO", "WORLD"], vec!["HELLO", "WORLD"]), + // UTF8 characters + ( + vec![ + "السلام عليكم", + "Dobrý den", + "שָׁלוֹם", + "नमस्ते", + "こんにちは", + "안녕하세요", + "你好", + "Olá", + "Здравствуйте", + "Hola", + ], + vec![ + "السلام عليكم", + "DOBRÝ DEN", + "שָׁלוֹם", + "नमस्ते", + "こんにちは", + "안녕하세요", + "你好", + "OLÁ", + "ЗДРАВСТВУЙТЕ", + "HOLA", + ], + ), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, expected)| { + let array = Utf8Array::::from_slice(&array); + let result = upper(&array)?; + assert_eq!(array.len(), result.len()); + + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = Utf8Array::::from_slice(&expected); + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) +} + +#[test] +fn without_nulls_string() -> Result<()> { + without_nulls_utf8::() +} + +#[test] +fn without_nulls_large_string() -> Result<()> { + without_nulls_utf8::() +} + +#[test] +fn consistency_upper() { + use arrow2::datatypes::DataType::*; + use arrow2::datatypes::TimeUnit; + let datatypes = vec![ + Null, + Boolean, + UInt8, + UInt16, + UInt32, + UInt64, + Int8, + Int16, + Int32, + Int64, + Float32, + Float64, + Timestamp(TimeUnit::Second, None), + Timestamp(TimeUnit::Millisecond, None), + Timestamp(TimeUnit::Microsecond, None), + Timestamp(TimeUnit::Nanosecond, None), + Time64(TimeUnit::Microsecond), + Time64(TimeUnit::Nanosecond), + Date32, + Time32(TimeUnit::Second), + Time32(TimeUnit::Millisecond), + Date64, + Utf8, + LargeUtf8, + Binary, + LargeBinary, + Duration(TimeUnit::Second), + Duration(TimeUnit::Millisecond), + Duration(TimeUnit::Microsecond), + Duration(TimeUnit::Nanosecond), + ]; + + datatypes.into_iter().for_each(|d1| { + let array = new_null_array(d1.clone(), 10); + if can_upper(&d1) { + assert!(upper(array.as_ref()).is_ok()); + } else { + assert!(upper(array.as_ref()).is_err()); + } + }); +}