Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Improved performance of utf8 comparison (1.7x-4x) (#322)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao committed Aug 24, 2021
1 parent 77a2934 commit 6cc3937
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 24 deletions.
10 changes: 10 additions & 0 deletions benches/comparison_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,16 @@ fn add_benchmark(c: &mut Criterion) {
c.bench_function(&format!("bool scalar 2^{}", log2_size), |b| {
b.iter(|| bench_op_scalar(&arr_a, &BooleanScalar::from(Some(true)), Operator::Eq))
});

let arr_a = create_string_array::<i32>(size, 0.1, 42);
let arr_b = create_string_array::<i32>(size, 0.1, 43);
c.bench_function(&format!("utf8 2^{}", log2_size), |b| {
b.iter(|| bench_op(&arr_a, &arr_b, Operator::Eq))
});

// Array-vs-scalar comparison. The label carries "scalar" (mirroring the
// "bool scalar" benchmark) so it does not share an id with the
// array-vs-array "utf8 2^{}" benchmark registered just above.
c.bench_function(&format!("utf8 scalar 2^{}", log2_size), |b| {
    b.iter(|| bench_op_scalar(&arr_a, &Utf8Scalar::<i32>::from(Some("abc")), Operator::Eq))
});
})
}

Expand Down
2 changes: 1 addition & 1 deletion benches/filter_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_built_filter(&sparse_filter, &data_array))
});

let data_array = create_string_array::<i32>(size, 0.5);
let data_array = create_string_array::<i32>(size, 0.5, 42);
c.bench_function("filter context string", |b| {
b.iter(|| bench_built_filter(&filter, &data_array))
});
Expand Down
2 changes: 1 addition & 1 deletion benches/sort_kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_lexsort(&arr_a, &arr_b))
});

let arr_a = create_string_array::<i32>(size, 0.1);
let arr_a = create_string_array::<i32>(size, 0.1, 42);
c.bench_function(&format!("sort utf8 null 2^{}", log2_size), |b| {
b.iter(|| bench_sort(&arr_a))
});
Expand Down
12 changes: 6 additions & 6 deletions benches/take_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,36 +91,36 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_take(&values, &indices))
});

let values = create_string_array::<i32>(512, 0.0);
let values = create_string_array::<i32>(512, 0.0, 42);
let indices = create_random_index(512, 0.0);
c.bench_function("take str 512", |b| b.iter(|| bench_take(&values, &indices)));

let values = create_string_array::<i32>(1024, 0.0);
let values = create_string_array::<i32>(1024, 0.0, 42);
let indices = create_random_index(1024, 0.0);
c.bench_function("take str 1024", |b| {
b.iter(|| bench_take(&values, &indices))
});

let values = create_string_array::<i32>(512, 0.0);
let values = create_string_array::<i32>(512, 0.0, 42);
let indices = create_random_index(512, 0.5);
c.bench_function("take str null indices 512", |b| {
b.iter(|| bench_take(&values, &indices))
});

let values = create_string_array::<i32>(1024, 0.0);
let values = create_string_array::<i32>(1024, 0.0, 42);
let indices = create_random_index(1024, 0.5);
c.bench_function("take str null indices 1024", |b| {
b.iter(|| bench_take(&values, &indices))
});

let values = create_string_array::<i32>(1024, 0.5);
let values = create_string_array::<i32>(1024, 0.5, 42);

let indices = create_random_index(1024, 0.0);
c.bench_function("take str null values 1024", |b| {
b.iter(|| bench_take(&values, &indices))
});

let values = create_string_array::<i32>(1024, 0.5);
let values = create_string_array::<i32>(1024, 0.5, 42);
let indices = create_random_index(1024, 0.5);
c.bench_function("take str null values null indices 1024", |b| {
b.iter(|| bench_take(&values, &indices))
Expand Down
2 changes: 1 addition & 1 deletion benches/write_ipc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ fn add_benchmark(c: &mut Criterion) {
});

(0..=10).step_by(2).for_each(|i| {
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1);
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1, 42);
let a = format!("write utf8 2^{}", 10 + i);
c.bench_function(&a, |b| b.iter(|| write(array).unwrap()));
});
Expand Down
4 changes: 2 additions & 2 deletions benches/write_parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ fn add_benchmark(c: &mut Criterion) {
});

(0..=10).step_by(2).for_each(|i| {
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1);
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1, 42);
let a = format!("write utf8 2^{}", 10 + i);
c.bench_function(&a, |b| b.iter(|| write(array, Encoding::Plain).unwrap()));
});

(0..=10).step_by(2).for_each(|i| {
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1);
let array = &create_string_array::<i32>(1024 * 2usize.pow(i), 0.1, 42);
let a = format!("write utf8 delta 2^{}", 10 + i);
c.bench_function(&a, |b| {
b.iter(|| write(array, Encoding::DeltaLengthByteArray).unwrap())
Expand Down
14 changes: 4 additions & 10 deletions src/compute/comparison/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,9 @@ where
let validity = combine_validities(lhs.validity(), rhs.validity());

let values = lhs
.iter()
.zip(rhs.iter())
.map(|(lhs, rhs)| match (lhs, rhs) {
(Some(lhs), Some(rhs)) => op(lhs, rhs),
_ => false,
});
.values_iter()
.zip(rhs.values_iter())
.map(|(lhs, rhs)| op(lhs, rhs));
let values = Bitmap::from_trusted_len_iter(values);

Ok(BooleanArray::from_data(values, validity))
Expand All @@ -57,10 +54,7 @@ where
{
let validity = lhs.validity().clone();

let values = lhs.iter().map(|lhs| match lhs {
None => false,
Some(lhs) => op(lhs, rhs),
});
let values = lhs.values_iter().map(|lhs| op(lhs, rhs));
let values = Bitmap::from_trusted_len_iter(values);

BooleanArray::from_data(values, validity)
Expand Down
6 changes: 3 additions & 3 deletions src/util/bench_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,15 +95,15 @@ where
}

/// Creates a random (but deterministically seeded) array of a given size and null density
pub fn create_string_array<O: Offset>(size: usize, null_density: f32) -> Utf8Array<O> {
let rng = &mut seedable_rng();
pub fn create_string_array<O: Offset>(size: usize, null_density: f32, seed: u64) -> Utf8Array<O> {
let mut rng = StdRng::seed_from_u64(seed);

(0..size)
.map(|_| {
if rng.gen::<f32>() < null_density {
None
} else {
let value = rng
let value = (&mut rng)
.sample_iter(&Alphanumeric)
.take(4)
.map(char::from)
Expand Down

0 comments on commit 6cc3937

Please sign in to comment.