Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions rust/lance-datafusion/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,16 @@
// will always yield "x = 7_u64" regardless of the type of the column "x". As a result, we
// need to do that literal coercion ourselves.
pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option<ScalarValue> {
// A dictionary target coerces the value to the dictionary's value type and
// re-wraps it as a dictionary literal. Nulls keep their untyped form,
// matching the existing `ScalarValue::Null` behavior for all targets.
if let DataType::Dictionary(key_type, value_type) = ty {
if value.is_null() {
return Some(value.clone());
Comment on lines +24 to +25
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve dictionary type for typed null literals

When the filter literal is already a typed null, such as an API-built Expr::Literal(ScalarValue::Utf8(None), _) or an IN value that has been typed as Utf8, this early is_null() return leaves the literal as Utf8(None) instead of producing a Dictionary(Int16, Utf8) scalar for a dictionary column. resolve_value then installs that mismatched literal next to the dictionary column, so the same dictionary filter path fixed for non-null strings can still fail planning/evaluation for typed-null predicates; this guard should only preserve the untyped ScalarValue::Null case or coerce the inner null and wrap it in the dictionary type.

Useful? React with 👍 / 👎.

}
let inner = safe_coerce_scalar(value, value_type)?;
return Some(ScalarValue::Dictionary(key_type.clone(), Box::new(inner)));
}
match value {
ScalarValue::Int8(val) => match ty {
DataType::Int8 => Some(value.clone()),
Expand Down Expand Up @@ -436,6 +446,9 @@
DataType::BinaryView => Some(value.clone()),
_ => None,
},
// A dictionary-encoded literal (e.g. produced by DataFusion's dictionary
// cast in the scalar-index path) coerces by unwrapping its underlying value.
ScalarValue::Dictionary(_, inner) => safe_coerce_scalar(inner, ty),
_ => None,
}
}
Expand Down Expand Up @@ -770,9 +783,78 @@
assert_eq!(
safe_coerce_scalar(
&ScalarValue::BinaryView(Some(vec![1, 2, 3])),
&DataType::BinaryView

Check warning on line 786 in rust/lance-datafusion/src/expr.rs

View workflow job for this annotation

GitHub Actions / format

Diff in /home/runner/work/lance/lance/rust/lance-datafusion/src/expr.rs
),
Some(ScalarValue::BinaryView(Some(vec![1, 2, 3])))
);
}

#[test]
fn test_dictionary_coerce() {
let dict_ty = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8));

// A string literal coerces to a dictionary target by wrapping the
// coerced value in a dictionary scalar.
assert_eq!(
safe_coerce_scalar(&ScalarValue::Utf8(Some("com".to_string())), &dict_ty),
Some(ScalarValue::Dictionary(
Box::new(DataType::Int16),
Box::new(ScalarValue::Utf8(Some("com".to_string()))),
))
);

// The inner value is coerced through to the dictionary value type, so a
// LargeUtf8 literal lands as a Utf8 value inside the dictionary.
assert_eq!(
safe_coerce_scalar(&ScalarValue::LargeUtf8(Some("com".to_string())), &dict_ty),
Some(ScalarValue::Dictionary(
Box::new(DataType::Int16),
Box::new(ScalarValue::Utf8(Some("com".to_string()))),
))
);

// A dictionary literal round-trips back to its value type.
assert_eq!(
safe_coerce_scalar(
&ScalarValue::Dictionary(
Box::new(DataType::Int16),
Box::new(ScalarValue::Utf8(Some("com".to_string()))),
),
&DataType::Utf8,
),
Some(ScalarValue::Utf8(Some("com".to_string())))
);

// A dictionary literal coerces to a dictionary target, adopting the
// target's key type.
assert_eq!(
safe_coerce_scalar(
&ScalarValue::Dictionary(
Box::new(DataType::Int32),
Box::new(ScalarValue::Utf8(Some("com".to_string()))),
),
&dict_ty,
),
Some(ScalarValue::Dictionary(
Box::new(DataType::Int16),
Box::new(ScalarValue::Utf8(Some("com".to_string()))),
))
);

// Null literals keep their untyped form, matching the behavior for all
// other target types.
assert_eq!(
safe_coerce_scalar(&ScalarValue::Utf8(None), &dict_ty),
Some(ScalarValue::Utf8(None))
);

// A value that cannot be coerced to the dictionary value type fails.
assert_eq!(
safe_coerce_scalar(
&ScalarValue::Utf8(Some("com".to_string())),
&DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Int32)),
),
None
);
}
}
33 changes: 32 additions & 1 deletion rust/lance-index/src/scalar/btree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,7 @@ impl Ord for OrderableScalarValue {
}
}
(Struct(_arr), _) => panic!("Attempt to compare Struct with non-Struct"),
(Dictionary(_k1, _v1), Dictionary(_k2, _v2)) => todo!(),
(Dictionary(_k1, v1), Dictionary(_k2, v2)) => Self(*v1.clone()).cmp(&Self(*v2.clone())),
(Dictionary(_, v1), Null) => Self(*v1.clone()).cmp(&Self(ScalarValue::Null)),
(Dictionary(_, _), _) => panic!("Attempt to compare Dictionary with non-Dictionary"),
// What would a btree of unions even look like? May not be possible.
Expand Down Expand Up @@ -3036,6 +3036,37 @@ mod tests {
assert!(size_of_many_i32 > 128 * 4);
}

#[test]
fn test_orderable_dictionary_cmp() {
use arrow_schema::DataType;
use std::cmp::Ordering;

let dict = |s: &str, key: DataType| {
OrderableScalarValue(ScalarValue::Dictionary(
Box::new(key),
Box::new(ScalarValue::Utf8(Some(s.to_string()))),
))
};

// Dictionary scalars are ordered by their underlying value, regardless
// of the key type. This is exercised when loading a scalar index built
// on a dictionary-encoded column into a BTreeMap.
assert_eq!(
dict("a", DataType::Int16).cmp(&dict("b", DataType::Int16)),
Ordering::Less
);
assert_eq!(
dict("b", DataType::Int32).cmp(&dict("b", DataType::Int16)),
Ordering::Equal
);

// A non-null dictionary value sorts after null.
assert_eq!(
dict("a", DataType::Int16).cmp(&OrderableScalarValue(ScalarValue::Null)),
Ordering::Greater
);
}

#[tokio::test]
async fn test_null_ids() {
let tmpdir = TempObjDir::default();
Expand Down
87 changes: 87 additions & 0 deletions rust/lance/src/dataset/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8771,6 +8771,93 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\")
);
}

/// Build an in-memory dataset with a single `Dictionary(Int16, Utf8)` column.
/// The dictionary cycles through "a", "b", "c" so each value appears in a
/// predictable, repeated pattern.
async fn dictionary_string_dataset() -> Dataset {
use arrow_array::{Int16Array, Int16DictionaryArray};

let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
"etld",
DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)),
false,
)]));

let dictionary = Arc::new(StringArray::from(vec!["a", "b", "c"]));
let indices = Int16Array::from((0..30).map(|i| i % 3).collect::<Vec<_>>());
let dict_array = Int16DictionaryArray::try_new(indices, dictionary).unwrap();

let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(dict_array)]).unwrap();
let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
Dataset::write(reader, "memory://test_dict_filter", None)
.await
.unwrap()
}

/// Regression test for filtering a dictionary-encoded string column via the
/// SQL string path (`Scanner::filter`). This used to fail to plan with
/// "could not convert to literal of type 'Dictionary(Int16, Utf8)'".
#[tokio::test]
async fn test_filter_on_dictionary_string_column() {
let dataset = dictionary_string_dataset().await;

// Equality predicate.
let count = dataset
.scan()
.filter("etld = 'a'")
.unwrap()
.try_into_batch()
.await
.unwrap()
.num_rows();
assert_eq!(count, 10);

// IN-list predicate.
let count = dataset
.scan()
.filter("etld IN ('a', 'b')")
.unwrap()
.try_into_batch()
.await
.unwrap()
.num_rows();
assert_eq!(count, 20);
}

/// An `IN`/`=` predicate on a dictionary column with a scalar index should be
/// pushed down to the index rather than falling back to a full scan.
#[tokio::test]
async fn test_dictionary_string_column_uses_scalar_index() {
use lance_index::scalar::BuiltinIndexType;

let mut dataset = dictionary_string_dataset().await;
let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap);
dataset
.create_index(&["etld"], IndexType::Scalar, None, &params, true)
.await
.unwrap();

let mut scanner = dataset.scan();
scanner.filter("etld IN ('a', 'b')").unwrap();
let plan = scanner.create_plan().await.unwrap();
let plan_str = format!("{:?}", plan);
assert!(
plan_str.contains("ScalarIndexExec") || plan_str.contains("MaterializeIndex"),
"IN on a dictionary column should use the scalar index, but got: {}",
plan_str
);

let count = dataset
.scan()
.filter("etld IN ('a', 'b')")
.unwrap()
.try_into_batch()
.await
.unwrap()
.num_rows();
assert_eq!(count, 20);
}

#[tokio::test]
async fn test_like_prefix_with_segmented_zone_map() {
use lance_index::scalar::BuiltinIndexType;
Expand Down
Loading