Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Fix escaped like wildcards #1204

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ csv-core = { version = "0.1", optional = true }
csv-async = { version = "^1.1", optional = true }

regex = { version = "^1.3", optional = true }
regex-syntax = { version = "^0.6", optional = true }
streaming-iterator = { version = "0.1", optional = true }
fallible-streaming-iterator = { version = "0.1", optional = true }

Expand Down Expand Up @@ -135,6 +136,7 @@ full = [
"io_avro_compression",
"io_avro_async",
"regex",
"regex-syntax",
"compute",
# parses timezones used in timestamp conversions
"chrono-tz",
Expand Down Expand Up @@ -190,7 +192,7 @@ compute_filter = []
compute_hash = ["multiversion"]
compute_if_then_else = []
compute_length = []
compute_like = ["regex"]
compute_like = ["regex", "regex-syntax"]
compute_limit = []
compute_merge_sort = ["itertools", "compute_sort"]
compute_nullif = ["compute_comparison"]
Expand Down
45 changes: 42 additions & 3 deletions src/compute/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,41 @@ fn is_like_pattern(c: char) -> bool {
c == '%' || c == '_'
}

/// Transforms a SQL `LIKE` `pattern` into a regex-compatible pattern.
///
/// The pattern is scanned once, character by character, applying these rules
/// (backslashes below are shown as they appear in a Rust string literal):
///
/// 1. Unescaped like wildcards become their regex counterparts: `%` => `.*` and `_` => `.`
/// 2. Regex meta characters are escaped so that they match literally. For example: `.` => `\\.`
/// 3. Escaped like wildcards lose their escape character so that they match literally.
///    For example: `\\%` => `%`
/// 4. A backslash that is not followed by a like wildcard is emitted as an escaped
///    backslash, so it matches a literal backslash.
fn replace_pattern(pattern: &str) -> String {
    // The output is at least as long as the input in the common case; reserving
    // up front avoids most reallocations while pushing.
    let mut result = String::with_capacity(pattern.len());
    // Iterate the borrowed pattern directly; no owned copy is needed.
    let mut chars_iter = pattern.chars().peekable();
    while let Some(c) = chars_iter.next() {
        match c {
            '\\' => match chars_iter.peek().copied() {
                Some(next) if is_like_pattern(next) => {
                    // Escaped wildcard: keep only the wildcard itself, as a literal.
                    result.push(next);
                    // Skipping the next char as it is already appended
                    chars_iter.next();
                }
                // Lone backslash (or backslash at the end): match it literally.
                _ => result.push_str("\\\\"),
            },
            c if regex_syntax::is_meta_character(c) => {
                result.push('\\');
                result.push(c);
            }
            '%' => result.push_str(".*"),
            '_' => result.push('.'),
            c => result.push(c),
        }
    }
    result
}

#[inline]
Expand Down Expand Up @@ -108,7 +141,10 @@ fn a_like_utf8_scalar<O: Offset, F: Fn(bool) -> bool>(

let values = if !rhs.contains(is_like_pattern) {
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x == rhs)))
} else if rhs.ends_with('%') && !rhs[..rhs.len() - 1].contains(is_like_pattern) {
} else if rhs.ends_with('%')
&& !rhs.ends_with("\\%")
&& !rhs[..rhs.len() - 1].contains(is_like_pattern)
{
// fast path, can use starts_with
let starts_with = &rhs[..rhs.len() - 1];
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.starts_with(starts_with))))
Expand Down Expand Up @@ -260,7 +296,10 @@ fn a_like_binary_scalar<O: Offset, F: Fn(bool) -> bool>(

let values = if !pattern.contains(is_like_pattern) {
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x == rhs)))
} else if pattern.ends_with('%') && !pattern[..pattern.len() - 1].contains(is_like_pattern) {
} else if pattern.ends_with('%')
&& !pattern.ends_with("\\%")
&& !pattern[..pattern.len() - 1].contains(is_like_pattern)
{
// fast path, can use starts_with
let starts_with = &rhs[..rhs.len() - 1];
Bitmap::from_trusted_len_iter(lhs.values_iter().map(|x| op(x.starts_with(starts_with))))
Expand Down
22 changes: 22 additions & 0 deletions tests/it/compute/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,28 @@ fn test_like_binary_scalar() -> Result<()> {
Ok(())
}

/// Exercises `like_utf8_scalar` with a trailing wildcard, a plain literal and
/// escaped `%` / `_` wildcards.
#[test]
fn test_like_utf8_scalar() -> Result<()> {
    // Trailing-wildcard and exact-literal patterns over the same input.
    let words = Utf8Array::<i32>::from_slice(&["Arrow", "Arrow", "Arrow", "BA"]);

    let prefix_matches = like_utf8_scalar(&words, "A%").unwrap();
    assert_eq!(
        prefix_matches,
        BooleanArray::from_slice(&[true, true, true, false])
    );

    let exact_matches = like_utf8_scalar(&words, "Arrow").unwrap();
    assert_eq!(
        exact_matches,
        BooleanArray::from_slice(&[true, true, true, false])
    );

    // An escaped `%` must match only a literal percent sign.
    let with_percent = Utf8Array::<i32>::from_slice(&["A%", "Arrow"]);
    let escaped_percent = like_utf8_scalar(&with_percent, "A\\%").unwrap();
    assert_eq!(escaped_percent, BooleanArray::from_slice(&[true, false]));

    // An escaped `_` must match only a literal underscore.
    let with_underscore = Utf8Array::<i32>::from_slice(&["A_row", "Arrow"]);
    let escaped_underscore = like_utf8_scalar(&with_underscore, "A\\_row").unwrap();
    assert_eq!(escaped_underscore, BooleanArray::from_slice(&[true, false]));

    Ok(())
}

#[test]
fn test_nlike_binary_scalar() -> Result<()> {
let array = BinaryArray::<i32>::from_slice(&["Arrow", "Arrow", "Arrow", "BA"]);
Expand Down