From a810b03f148cc915457fabbe4bfbe22f0241aca2 Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Mon, 23 May 2022 12:10:57 +0800 Subject: [PATCH] Simplified `min_max_string` and `min_max_binary` (#1004) --- benches/aggregate.rs | 12 +++ src/compute/aggregate/min_max.rs | 116 +++++++------------------- tests/it/compute/aggregate/min_max.rs | 28 +++++-- 3 files changed, 67 insertions(+), 89 deletions(-) diff --git a/benches/aggregate.rs b/benches/aggregate.rs index d515fff1281..f157550d153 100644 --- a/benches/aggregate.rs +++ b/benches/aggregate.rs @@ -42,6 +42,18 @@ fn add_benchmark(c: &mut Criterion) { c.bench_function(&format!("min null 2^{} f32", log2_size), |b| { b.iter(|| bench_min(&arr_a)) }); + + let arr_a = create_string_array::(1, size, 0.0, 0); + + c.bench_function(&format!("min 2^{} utf8", log2_size), |b| { + b.iter(|| bench_min(&arr_a)) + }); + + let arr_a = create_string_array::(1, size, 0.1, 0); + + c.bench_function(&format!("min null 2^{} utf8", log2_size), |b| { + b.iter(|| bench_min(&arr_a)) + }); }); } diff --git a/src/compute/aggregate/min_max.rs b/src/compute/aggregate/min_max.rs index 2c6478cff48..54c9e7d5048 100644 --- a/src/compute/aggregate/min_max.rs +++ b/src/compute/aggregate/min_max.rs @@ -30,86 +30,6 @@ pub trait SimdOrd { fn new_max() -> Self; } -/// Helper to compute min/max of [`BinaryArray`] -fn min_max_binary bool>( - array: &BinaryArray, - cmp: F, -) -> Option<&[u8]> { - let null_count = array.null_count(); - - if null_count == array.len() || array.len() == 0 { - return None; - } - let value = if array.validity().is_some() { - array.iter().fold(None, |mut acc: Option<&[u8]>, v| { - if let Some(item) = v { - if let Some(acc) = acc.as_mut() { - if cmp(acc, item) { - *acc = item - } - } else { - acc = Some(item) - } - } - acc - }) - } else { - array - .values_iter() - .fold(None, |mut acc: Option<&[u8]>, item| { - if let Some(acc) = acc.as_mut() { - if cmp(acc, item) { - *acc = item - } - } else { - acc = Some(item) - } - acc - }) - }; - value -} - -/// Helper to compute min/max of [`Utf8Array`] -fn min_max_string bool>( - array: &Utf8Array, - cmp: F, -) -> Option<&str> { - let null_count = array.null_count(); - - if null_count == array.len() || array.len() == 0 { - return None; - } - let value = if array.validity().is_some() { - array.iter().fold(None, |mut acc: Option<&str>, v| { - if let Some(item) = v { - if let Some(acc) = acc.as_mut() { - if cmp(acc, item) { - *acc = item - } - } else { - acc = Some(item) - } - } - acc - }) - } else { - array - .values_iter() - .fold(None, |mut acc: Option<&str>, item| { - if let Some(acc) = acc.as_mut() { - if cmp(acc, item) { - *acc = item - } - } else { - acc = Some(item) - } - acc - }) - }; - value -} - fn nonnull_min_primitive(values: &[T]) -> T where T: NativeType + Simd, @@ -278,24 +198,52 @@ where }) } +/// Helper to compute min/max of [`BinaryArray`] and [`Utf8Array`] +macro_rules! min_max_binary_utf8 { + ($array: expr, $cmp: expr) => { + if $array.null_count() == $array.len() { + None + } else if $array.validity().is_some() { + $array + .iter() + .reduce(|v1, v2| match (v1, v2) { + (None, v2) => v2, + (v1, None) => v1, + (Some(v1), Some(v2)) => { + if $cmp(v1, v2) { + Some(v2) + } else { + Some(v1) + } + } + }) + .unwrap_or(None) + } else { + $array + .values_iter() + .reduce(|v1, v2| if $cmp(v1, v2) { v2 } else { v1 }) + } + }; +} + /// Returns the maximum value in the binary array, according to the natural order. pub fn max_binary(array: &BinaryArray) -> Option<&[u8]> { - min_max_binary(array, |a, b| a < b) + min_max_binary_utf8!(array, |a, b| a < b) } /// Returns the minimum value in the binary array, according to the natural order. pub fn min_binary(array: &BinaryArray) -> Option<&[u8]> { - min_max_binary(array, |a, b| a > b) + min_max_binary_utf8!(array, |a, b| a > b) } /// Returns the maximum value in the string array, according to the natural order. pub fn max_string(array: &Utf8Array) -> Option<&str> { - min_max_string(array, |a, b| a < b) + min_max_binary_utf8!(array, |a, b| a < b) } /// Returns the minimum value in the string array, according to the natural order. pub fn min_string(array: &Utf8Array) -> Option<&str> { - min_max_string(array, |a, b| a > b) + min_max_binary_utf8!(array, |a, b| a > b) } /// Returns the minimum value in the boolean array. diff --git a/tests/it/compute/aggregate/min_max.rs b/tests/it/compute/aggregate/min_max.rs index 224a245ae79..3cfc8b675f0 100644 --- a/tests/it/compute/aggregate/min_max.rs +++ b/tests/it/compute/aggregate/min_max.rs @@ -112,12 +112,11 @@ fn min_max_f64_edge_cases() { assert_eq!(Some(f64::INFINITY), max_primitive(&a)); } -// todo: convert me #[test] fn test_string_min_max_with_nulls() { let a = Utf8Array::::from(&[Some("b"), None, None, Some("a"), Some("c")]); - assert_eq!("a", min_string(&a).unwrap()); - assert_eq!("c", max_string(&a).unwrap()); + assert_eq!(Some("a"), min_string(&a)); + assert_eq!(Some("c"), max_string(&a)); } #[test] @@ -127,6 +126,13 @@ fn test_string_min_max_all_nulls() { assert_eq!(None, max_string(&a)); } +#[test] +fn test_string_min_max_no_null() { + let a = Utf8Array::::from(&[Some("abc"), Some("abd"), Some("bac"), Some("bbb")]); + assert_eq!(Some("abc"), min_string(&a)); + assert_eq!(Some("bbb"), max_string(&a)); +} + #[test] fn test_string_min_max_1() { let a = Utf8Array::::from(&[None, None, Some("b"), Some("a")]); @@ -192,8 +198,20 @@ fn test_boolean_min_max_smaller() { #[test] fn test_binary_min_max_with_nulls() { let a = BinaryArray::::from(&[Some(b"b"), None, None, Some(b"a"), Some(b"c")]); - assert_eq!("a".as_bytes(), min_binary(&a).unwrap()); - assert_eq!("c".as_bytes(), max_binary(&a).unwrap()); + assert_eq!(Some("a".as_bytes()), min_binary(&a)); + assert_eq!(Some("c".as_bytes()), max_binary(&a)); +} + +#[test] +fn test_binary_min_max_no_null() { + let a = BinaryArray::::from(&[ + Some("abc".as_bytes()), + Some(b"acd"), + Some(b"aabd"), + Some(b""), + ]); + assert_eq!(Some("".as_bytes()), min_binary(&a)); + assert_eq!(Some("acd".as_bytes()), max_binary(&a)); } #[test]