-
Notifications
You must be signed in to change notification settings - Fork 680
fix: schema isn't expected for IVF_PQ #3606
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -152,6 +152,18 @@ impl ProductQuantizationStorage { | |
| distance_type: DistanceType, | ||
| transposed: bool, | ||
| ) -> Result<Self> { | ||
| if batch.num_columns() != 2 { | ||
| log::warn!( | ||
| "PQ storage should have 2 columns, but got {} columns: {}", | ||
| batch.num_columns(), | ||
| batch.schema(), | ||
| ); | ||
| batch = batch.project(&[ | ||
| batch.schema().index_of(ROW_ID)?, | ||
| batch.schema().index_of(PQ_CODE_COLUMN)?, | ||
| ])?; | ||
| } | ||
|
|
||
| let Some(row_ids) = batch.column_by_name(ROW_ID) else { | ||
| return Err(Error::Index { | ||
| message: "Row ID column not found from PQ storage".to_string(), | ||
|
|
@@ -966,7 +978,7 @@ mod tests { | |
|
|
||
| use super::*; | ||
|
|
||
| use arrow_array::Float32Array; | ||
| use arrow_array::{Float32Array, UInt32Array}; | ||
| use arrow_schema::{DataType, Field, Schema as ArrowSchema}; | ||
| use lance_arrow::FixedSizeListArrayExt; | ||
| use lance_core::datatypes::Schema; | ||
|
|
@@ -1005,6 +1017,40 @@ mod tests { | |
| .unwrap() | ||
| } | ||
|
|
||
| async fn create_pq_storage_with_extra_column() -> ProductQuantizationStorage { | ||
| let codebook = Float32Array::from_iter_values((0..256 * DIM).map(|_| rand::random())); | ||
| let codebook = FixedSizeListArray::try_new_from_values(codebook, DIM as i32).unwrap(); | ||
| let pq = ProductQuantizer::new(NUM_SUB_VECTORS, 8, DIM, codebook, DistanceType::Dot); | ||
|
|
||
| let schema = ArrowSchema::new(vec![ | ||
| Field::new( | ||
| "vec", | ||
| DataType::FixedSizeList( | ||
| Field::new_list_field(DataType::Float32, true).into(), | ||
| DIM as i32, | ||
| ), | ||
| true, | ||
| ), | ||
| ROW_ID_FIELD.clone(), | ||
| Field::new("extra", DataType::UInt32, true), | ||
| ]); | ||
| let vectors = Float32Array::from_iter_values((0..TOTAL * DIM).map(|_| rand::random())); | ||
| let row_ids = UInt64Array::from_iter_values((0..TOTAL).map(|v| v as u64)); | ||
| let extra_column = UInt32Array::from_iter_values((0..TOTAL).map(|v| v as u32)); | ||
| let fsl = FixedSizeListArray::try_new_from_values(vectors, DIM as i32).unwrap(); | ||
| let batch = RecordBatch::try_new( | ||
| schema.into(), | ||
| vec![Arc::new(fsl), Arc::new(row_ids), Arc::new(extra_column)], | ||
| ) | ||
| .unwrap(); | ||
|
|
||
| StorageBuilder::new("vec".to_owned(), pq.distance_type, pq) | ||
| .unwrap() | ||
| .assert_num_columns(false) | ||
| .build(vec![batch]) | ||
| .unwrap() | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn test_build_pq_storage() { | ||
| let storage = create_pq_storage().await; | ||
|
|
@@ -1062,4 +1108,25 @@ mod tests { | |
| let dist2 = storage.dist_between(v, u); | ||
| assert_eq!(dist1, dist2); | ||
| } | ||
|
|
||
| #[tokio::test] | ||
| async fn test_remap_with_extra_column() { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this because some old indices will have this extra column and we need to make sure they are supported?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. right, we saw some feedbacks about this, so add this test to make sure the old indices could work with this fix |
||
| let storage = create_pq_storage_with_extra_column().await; | ||
| let mut mapping = HashMap::new(); | ||
| for i in 0..TOTAL / 2 { | ||
| mapping.insert(i as u64, Some((TOTAL + i) as u64)); | ||
| } | ||
| for i in TOTAL / 2..TOTAL { | ||
| mapping.insert(i as u64, None); | ||
| } | ||
| let new_storage = storage.remap(&mapping).unwrap(); | ||
| assert_eq!(new_storage.len(), TOTAL / 2); | ||
| assert_eq!(new_storage.row_ids.len(), TOTAL / 2); | ||
| for (i, row_id) in new_storage.row_ids().enumerate() { | ||
| assert_eq!(*row_id, (TOTAL + i) as u64); | ||
| } | ||
| assert_eq!(new_storage.batch.num_columns(), 2); | ||
| assert!(new_storage.batch.column_by_name(ROW_ID).is_some()); | ||
| assert!(new_storage.batch.column_by_name(PQ_CODE_COLUMN).is_some()); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we still get PQ storage with an extra column from a real workflow? Or is this just generating some kind of invalid input for testing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it's just for testing, we shouldn't see any extra column in real workflow