From d6b6c98a1ea8bc82f3752afa01abcdef5558a95b Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 10:31:46 -0700 Subject: [PATCH 01/16] refactor --- rust/src/dataset.rs | 110 ++-------------- rust/src/dataset/scanner.rs | 2 +- rust/src/index.rs | 97 +++++++++++++- rust/src/index/vector.rs | 136 ++++++++++++++------ rust/src/index/vector/diskann/builder.rs | 11 +- rust/src/index/vector/diskann/row_vertex.rs | 2 +- rust/src/index/vector/ivf.rs | 49 +++++-- rust/src/index/vector/opq.rs | 1 + rust/src/index/vector/pq.rs | 9 +- 9 files changed, 250 insertions(+), 167 deletions(-) diff --git a/rust/src/dataset.rs b/rust/src/dataset.rs index 205a767caaf..0bf8bfd129b 100644 --- a/rust/src/dataset.rs +++ b/rust/src/dataset.rs @@ -22,7 +22,7 @@ use std::time::SystemTime; use arrow_array::{ cast::as_struct_array, RecordBatch, RecordBatchReader, StructArray, UInt64Array, }; -use arrow_schema::{DataType, Schema as ArrowSchema}; +use arrow_schema::Schema as ArrowSchema; use arrow_select::{concat::concat_batches, take::take}; use chrono::prelude::*; use futures::stream::{self, StreamExt, TryStreamExt}; @@ -38,9 +38,6 @@ use self::scanner::Scanner; use crate::arrow::*; use crate::datatypes::Schema; use crate::format::{pb, Fragment, Index, Manifest}; -use crate::index::vector::ivf::{build_ivf_pq_index, IvfBuildParams}; -use crate::index::vector::pq::PQBuildParams; -use crate::index::{vector::VectorIndexParams, IndexParams, IndexType}; use crate::io::{ object_reader::{read_message, read_struct}, read_manifest, read_metadata_offset, write_manifest, FileWriter, ObjectStore, @@ -105,7 +102,7 @@ fn manifest_path(base: &Path, version: u64) -> Path { } /// Get the latest manifest path -fn latest_manifest_path(base: &Path) -> Path { +pub(crate) fn latest_manifest_path(base: &Path) -> Path { base.child(LATEST_MANIFEST_NAME) } @@ -357,98 +354,6 @@ impl Dataset { Ok(counts.iter().sum()) } - /// Create indices on columns. - /// - /// Upon finish, a new dataset version is generated. - /// - /// Parameters: - /// - /// - `columns`: the columns to build the indices on. - /// - `index_type`: specify [`IndexType`]. - /// - `name`: optional index name. Must be unique in the dataset. - /// if not provided, it will auto-generate one. - /// - `params`: index parameters. - pub async fn create_index( - &self, - columns: &[&str], - index_type: IndexType, - name: Option, - params: &dyn IndexParams, - ) -> Result { - if columns.len() != 1 { - return Err(Error::Index( - "Only support building index on 1 column at the moment".to_string(), - )); - } - let column = columns[0]; - let Some(field) = self.schema().field(column) else { - return Err(Error::Index(format!( - "CreateIndex: column '{column}' does not exist" - ))); - }; - - // Load indices from the disk. - let mut indices = self.load_indices().await?; - - let index_name = name.unwrap_or(format!("{column}_idx")); - if indices.iter().any(|i| i.name == index_name) { - return Err(Error::Index(format!( - "Index name '{index_name} already exists'" - ))); - } - - let index_id = Uuid::new_v4(); - match index_type { - IndexType::Vector => { - let vec_params = params - .as_any() - .downcast_ref::() - .ok_or_else(|| { - Error::Index("Vector index type must take a VectorIndexParams".to_string()) - })?; - - let ivf_params = IvfBuildParams { - num_partitions: vec_params.num_partitions as usize, - metric_type: vec_params.metric_type, - max_iters: vec_params.max_iterations, - }; - let pq_params = PQBuildParams { - num_sub_vectors: vec_params.num_sub_vectors as usize, - num_bits: 8, - metric_type: vec_params.metric_type, - max_iters: vec_params.max_iterations, - use_opq: vec_params.use_opq, - max_opq_iters: vec_params.max_opq_iterations, - }; - build_ivf_pq_index( - self, - column, - &index_name, - &index_id, - &ivf_params, - &pq_params, - ) - .await? - } - } - - let latest_manifest = self.latest_manifest().await?; - let mut new_manifest = self.manifest.as_ref().clone(); - new_manifest.version = latest_manifest.version + 1; - - // Write index metadata down - let new_idx = Index::new(index_id, &index_name, &[field.id], new_manifest.version); - indices.push(new_idx); - - write_manifest_file(&self.object_store, &mut new_manifest, Some(indices)).await?; - - Ok(Self { - object_store: self.object_store.clone(), - base: self.base.clone(), - manifest: Arc::new(new_manifest), - }) - } - pub async fn take(&self, row_indices: &[usize], projection: &Schema) -> Result { let mut sorted_indices: Vec = Vec::from_iter(row_indices.iter().map(|indice| *indice as u32)); @@ -572,7 +477,7 @@ impl Dataset { latest_manifest_path(&self.base) } - async fn latest_manifest(&self) -> Result { + pub(crate) async fn latest_manifest(&self) -> Result { read_manifest(&self.object_store, &self.latest_manifest_path()).await } @@ -660,7 +565,7 @@ impl Dataset { /// Finish writing the manifest file, and commit the changes by linking the latest manifest file /// to this version. -async fn write_manifest_file( +pub(crate) async fn write_manifest_file( object_store: &ObjectStore, manifest: &mut Manifest, indices: Option>, @@ -692,6 +597,9 @@ async fn write_manifest_file_to_path( #[cfg(test)] mod tests { use super::*; + use crate::index::IndexType; + use crate::index::vector::MetricType; + use crate::index::{DatasetIndexExt, vector::VectorIndexParams}; use crate::{datatypes::Schema, utils::testing::generate_random_array}; use crate::dataset::WriteMode::Overwrite; @@ -1129,9 +1037,7 @@ mod tests { let dataset = Dataset::write(&mut reader, test_uri, None).await.unwrap(); // Make sure valid arguments should create index successfully - let mut params = VectorIndexParams::default(); - params.num_partitions = 10; - params.num_sub_vectors = 2; + let params = VectorIndexParams::ivf_pq(10, 8, 2, false, MetricType::L2, 50); let dataset = dataset .create_index(&["embeddings"], IndexType::Vector, None, ¶ms) .await diff --git a/rust/src/dataset/scanner.rs b/rust/src/dataset/scanner.rs index da94e912d7d..4edaf6bd5b9 100644 --- a/rust/src/dataset/scanner.rs +++ b/rust/src/dataset/scanner.rs @@ -590,7 +590,7 @@ mod test { use super::*; use crate::arrow::*; - use crate::index::{vector::VectorIndexParams, IndexType}; + use crate::index::{DatasetIndexExt, {vector::VectorIndexParams, IndexType}}; use crate::{arrow::RecordBatchBuffer, dataset::WriteParams}; #[tokio::test] diff --git a/rust/src/index.rs b/rust/src/index.rs index 31ef5f542d5..abb96e29e76 100644 --- a/rust/src/index.rs +++ b/rust/src/index.rs @@ -20,8 +20,10 @@ use std::any::Any; use std::fmt; +use std::sync::Arc; use async_trait::async_trait; +use uuid::Uuid; /// Protobuf definitions for the index on-disk format. pub mod pb { @@ -31,7 +33,11 @@ pub mod pb { pub mod vector; -use crate::Result; +use crate::dataset::write_manifest_file; +use crate::format::Index; +use crate::{dataset::Dataset, Error, Result}; + +use self::vector::{build_vector_index, VectorIndexParams}; /// Index Type pub enum IndexType { @@ -58,6 +64,93 @@ pub trait IndexBuilder { async fn build(&self) -> Result<()>; } -pub trait IndexParams { +pub trait IndexParams: Send + Sync { fn as_any(&self) -> &dyn Any; } + +/// Extends Dataset with secondary index. +#[async_trait] +pub(crate) trait DatasetIndexExt { + /// Create indices on columns. + /// + /// Upon finish, a new dataset version is generated. + /// + /// Parameters: + /// + /// - `columns`: the columns to build the indices on. + /// - `index_type`: specify [`IndexType`]. + /// - `name`: optional index name. Must be unique in the dataset. + /// if not provided, it will auto-generate one. + /// - `params`: index parameters. + async fn create_index( + &self, + columns: &[&str], + index_type: IndexType, + name: Option, + params: &dyn IndexParams, + ) -> Result; +} + +#[async_trait] +impl DatasetIndexExt for Dataset { + async fn create_index( + &self, + columns: &[&str], + index_type: IndexType, + name: Option, + params: &dyn IndexParams, + ) -> Result { + if columns.len() != 1 { + return Err(Error::Index( + "Only support building index on 1 column at the moment".to_string(), + )); + } + let column = columns[0]; + let Some(field) = self.schema().field(column) else { + return Err(Error::Index(format!( + "CreateIndex: column '{column}' does not exist" + ))); + }; + + // Load indices from the disk. + let mut indices = self.load_indices().await?; + + let index_name = name.unwrap_or(format!("{column}_idx")); + if indices.iter().any(|i| i.name == index_name) { + return Err(Error::Index(format!( + "Index name '{index_name} already exists'" + ))); + } + + let index_id = Uuid::new_v4(); + match index_type { + IndexType::Vector => { + // Vector index params. + let vec_params = params + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::Index("Vector index type must take a VectorIndexParams".to_string()) + })?; + + build_vector_index(self, column, &index_name, &index_id, &vec_params).await?; + } + } + + let latest_manifest = self.latest_manifest().await?; + let mut new_manifest = self.manifest.as_ref().clone(); + new_manifest.version = latest_manifest.version + 1; + + // Write index metadata down + let new_idx = Index::new(index_id, &index_name, &[field.id], new_manifest.version); + indices.push(new_idx); + + write_manifest_file(&self.object_store, &mut new_manifest, Some(indices)).await?; + + Ok(Self { + object_store: self.object_store.clone(), + base: self.base.clone(), + manifest: Arc::new(new_manifest), + }) + } +} diff --git a/rust/src/index/vector.rs b/rust/src/index/vector.rs index f3b29757ee7..a0ef669bd0d 100644 --- a/rust/src/index/vector.rs +++ b/rust/src/index/vector.rs @@ -32,7 +32,10 @@ pub mod pq; mod traits; mod utils; -use self::{ivf::IVFIndex, pq::PQIndex}; +use self::{ + ivf::{build_ivf_pq_index, IVFIndex, IvfBuildParams}, + pq::{PQBuildParams, PQIndex}, +}; use super::{pb, IndexParams}; use crate::{ @@ -159,29 +162,19 @@ impl TryFrom<&str> for MetricType { } } +pub trait VertexIndexStageParams: std::fmt::Debug + Send + Sync { + fn as_any(&self) -> &dyn Any; +} + /// The parameters to build vector index. pub struct VectorIndexParams { - // This is hard coded for IVF_PQ for now. Can refactor later to support more. - /// The number of IVF partitions - pub num_partitions: u32, - - /// the number of bits to present the centroids used in PQ. - pub nbits: u8, - - /// Use Optimized Product Quantizer. - pub use_opq: bool, - - /// the number of sub vectors used in PQ. - pub num_sub_vectors: u32, + pub stages: Vec>, /// Vector distance metrics type. pub metric_type: MetricType, /// Max number of iterations to train a KMean model pub max_iterations: usize, - - /// Max number of iterations to train a OPQ model. - pub max_opq_iterations: usize, } impl VectorIndexParams { @@ -190,25 +183,34 @@ impl VectorIndexParams { /// Parameters /// /// - `num_partitions`: the number of IVF partitions. - /// - `nbits`: the number of bits to present the centroids used in PQ. Can only be `8` for now. + /// - `num_bits`: the number of bits to present the centroids used in PQ. Can only be `8` for now. /// - `num_sub_vectors`: the number of sub vectors used in PQ. /// - `metric_type`: how to compute distance, i.e., `L2` or `Cosine`. pub fn ivf_pq( - num_partitions: u32, - nbits: u8, - num_sub_vectors: u32, + num_partitions: usize, + num_bits: u8, + num_sub_vectors: usize, use_opq: bool, metric_type: MetricType, max_iterations: usize, ) -> Self { + let mut stages: Vec> = vec![]; + if use_opq {}; + + stages.push(Box::new(IvfBuildParams::new(num_partitions))); + let mut pq_params = PQBuildParams::default(); + pq_params.num_bits = num_bits as usize; + pq_params.num_sub_vectors = num_sub_vectors as usize; + pq_params.use_opq = use_opq; + pq_params.metric_type = metric_type; + pq_params.max_iters = max_iterations; + pq_params.max_opq_iters = max_iterations; + + stages.push(Box::new(pq_params)); Self { - num_partitions, - nbits, - num_sub_vectors, - use_opq, + stages, metric_type, max_iterations, - max_opq_iterations: max_iterations, } } @@ -218,28 +220,82 @@ impl VectorIndexParams { } } -impl Default for VectorIndexParams { - fn default() -> Self { - Self { - num_partitions: 32, - nbits: 8, - num_sub_vectors: 16, - use_opq: false, - metric_type: MetricType::L2, - max_iterations: MAX_ITERATIONS, // Faiss - max_opq_iterations: MAX_OPQ_ITERATIONS, - } - } -} - impl IndexParams for VectorIndexParams { fn as_any(&self) -> &dyn Any { self } } +fn is_ivf_pq(stages: &[Box]) -> bool { + if stages.len() < 2 { + return false; + } + let len = stages.len(); + + matches!(&stages[len - 1], PQBuildParams) && matches!(&stages[len - 2], IvfBuildParams) +} + +fn is_diskann(stages: &[Box]) -> bool { + if stages.is_empty() { + return false; + } + matches!(stages.last().unwrap().as_ref(), DiskANNBuildParams) +} + +/// Build a Vector Index +pub(crate) async fn build_vector_index( + dataset: &Dataset, + column: &str, + name: &str, + uuid: &uuid::Uuid, + params: &VectorIndexParams, +) -> Result<()> { + let stages = ¶ms.stages; + + if stages.is_empty() { + return Err(Error::Index( + "Build Vector Index: must have at least 1 stage".to_string(), + )); + }; + + if is_ivf_pq(stages) { + // This is a IVF PQ index. + let len = stages.len(); + let ivf_params = stages[len - 2] + .as_ref() + .as_any() + .downcast_ref::() + .unwrap(); + let pq_params = stages[len - 1] + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::Index(format!("Build Vector Index: invalid stages: {:?}", stages)) + })?; + build_ivf_pq_index( + dataset, + column, + &name, + &uuid, + params.metric_type, + &ivf_params, + &pq_params, + ) + .await? + } else if is_diskann(stages) { + // This is DiskANN index. + } else { + return Err(Error::Index(format!( + "Build Vector Index: invalid stages: {:?}", + stages + ))); + } + + Ok(()) +} + /// Open the Vector index on dataset, specified by the `uuid`. -pub async fn open_index(dataset: &Dataset, uuid: &str) -> Result> { +pub(crate) async fn open_index(dataset: &Dataset, uuid: &str) -> Result> { let index_dir = dataset.indices_dir().child(uuid); let index_file = index_dir.child(INDEX_FILE_NAME); diff --git a/rust/src/index/vector/diskann/builder.rs b/rust/src/index/vector/diskann/builder.rs index 77d7c6f01b3..094158497f5 100644 --- a/rust/src/index/vector/diskann/builder.rs +++ b/rust/src/index/vector/diskann/builder.rs @@ -26,19 +26,17 @@ use rand::Rng; use crate::arrow::{linalg::MatrixView, *}; use crate::dataset::{Dataset, ROW_ID}; use crate::index::vector::diskann::row_vertex::RowVertexSerDe; -use crate::index::vector::diskann::{DiskANNParams, PQVertexSerDe}; +use crate::index::vector::diskann::DiskANNParams; use crate::index::vector::graph::{ builder::GraphBuilder, write_graph, VertexWithDistance, WriteGraphParams, }; use crate::index::vector::graph::{Graph, Vertex}; -use crate::index::vector::pq::{train_pq, ProductQuantizer}; -use crate::index::vector::utils::maybe_sample_training_data; use crate::index::vector::MetricType; use crate::linalg::l2::l2_distance; use crate::{Error, Result}; use super::row_vertex::RowVertex; -use super::{search::greedy_search, PQVertex}; +use super::search::greedy_search; /// Builder for DiskANN index. pub struct Builder { @@ -64,11 +62,6 @@ impl Builder { /// Build the index. pub async fn build(&self) -> Result<()> { - // Step 1: train PQ codebook. - let sample_size = 1000; - let training_data = - maybe_sample_training_data(self.dataset.as_ref(), &self.column, sample_size).await?; - // Randomly initialize the graph with r random neighbors for each vertex. let mut graph = init_graph( self.dataset.clone(), diff --git a/rust/src/index/vector/diskann/row_vertex.rs b/rust/src/index/vector/diskann/row_vertex.rs index 36d4bf1500b..ce63a5a978d 100644 --- a/rust/src/index/vector/diskann/row_vertex.rs +++ b/rust/src/index/vector/diskann/row_vertex.rs @@ -1,4 +1,3 @@ -use arrow_array::Float32Array; // Copyright 2023 Lance Developers. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +12,7 @@ use arrow_array::Float32Array; // See the License for the specific language governing permissions and // limitations under the License. +use arrow_array::Float32Array; use byteorder::{ByteOrder, LE}; use super::{Vertex, VertexSerDe}; diff --git a/rust/src/index/vector/ivf.rs b/rust/src/index/vector/ivf.rs index f9d40af236c..7caa7769029 100644 --- a/rust/src/index/vector/ivf.rs +++ b/rust/src/index/vector/ivf.rs @@ -39,7 +39,7 @@ use super::{ opq::train_opq, pq::{train_pq, PQBuildParams, ProductQuantizer}, utils::maybe_sample_training_data, - MetricType, Query, VectorIndex, INDEX_FILE_NAME, + MetricType, Query, VectorIndex, VertexIndexStageParams, INDEX_FILE_NAME, }; use crate::io::object_reader::ObjectReader; use crate::{ @@ -420,18 +420,41 @@ fn sanity_check(dataset: &Dataset, column: &str) -> Result<()> { } /// Parameters to build IVF partitions +#[derive(Debug, Clone)] pub struct IvfBuildParams { /// Number of partitions to build. pub num_partitions: usize, - /// Metric type, L2 or Cosine. - pub metric_type: MetricType, - // ---- kmeans parameters /// Max number of iterations to train kmeans. pub max_iters: usize, } +impl Default for IvfBuildParams { + fn default() -> Self { + Self { + num_partitions: 32, + max_iters: 50, + } + } +} + +impl VertexIndexStageParams for IvfBuildParams { + fn as_any(&self) -> &dyn Any { + self + } +} + +impl IvfBuildParams { + /// Create a new instance of `IvfBuildParams`. + pub fn new(num_partitions: usize) -> Self { + Self { + num_partitions, + ..Default::default() + } + } +} + /// Compute residual matrix. /// /// Parameters @@ -472,6 +495,7 @@ pub async fn build_ivf_pq_index( column: &str, index_name: &str, uuid: &Uuid, + metric_type: MetricType, ivf_params: &IvfBuildParams, pq_params: &PQBuildParams, ) -> Result<()> { @@ -480,7 +504,7 @@ pub async fn build_ivf_pq_index( ivf_params.num_partitions, if pq_params.use_opq { "O" } else { "" }, pq_params.num_sub_vectors, - ivf_params.metric_type, + metric_type, ); sanity_check(dataset, column)?; @@ -506,12 +530,11 @@ pub async fn build_ivf_pq_index( } // Train IVF partitions. - let ivf_model = train_ivf_model(&training_data, ivf_params).await?; + let ivf_model = train_ivf_model(&training_data, metric_type, ivf_params).await?; // Compute the residual vector for training PQ let ivf_centroids = ivf_model.centroids.as_ref().try_into()?; - let residual_data = - compute_residual_matrix(&training_data, &ivf_centroids, ivf_params.metric_type)?; + let residual_data = compute_residual_matrix(&training_data, &ivf_centroids, metric_type)?; let pq_training_data = MatrixView::new(residual_data, training_data.num_columns()); // The final train of PQ sub-vectors @@ -596,7 +619,7 @@ pub async fn build_ivf_pq_index( &transforms, ivf_model, pq, - ivf_params.metric_type, + metric_type, &batches, ) .await @@ -672,7 +695,11 @@ async fn write_index_file( } /// Train IVF partitions using kmeans. -async fn train_ivf_model(data: &MatrixView, params: &IvfBuildParams) -> Result { +async fn train_ivf_model( + data: &MatrixView, + metric_type: MetricType, + params: &IvfBuildParams, +) -> Result { let rng = SmallRng::from_entropy(); const REDOS: usize = 1; let centroids = super::kmeans::train_kmeans( @@ -683,7 +710,7 @@ async fn train_ivf_model(data: &MatrixView, params: &IvfBuildParams) -> Result Vec &dyn Any { + self + } +} + /// Train product quantization over (OPQ-rotated) residual vectors. pub(crate) async fn train_pq( data: &MatrixView, From 9a6e1583352af846213c1e5acb0b6e136732b019 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 10:45:52 -0700 Subject: [PATCH 02/16] build diskann params --- rust/src/dataset.rs | 4 ++-- rust/src/dataset/scanner.rs | 5 ++++- rust/src/index/vector.rs | 19 +++++++------------ rust/src/index/vector/diskann.rs | 9 ++++++++- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/rust/src/dataset.rs b/rust/src/dataset.rs index 0bf8bfd129b..f4976eec2f4 100644 --- a/rust/src/dataset.rs +++ b/rust/src/dataset.rs @@ -597,9 +597,9 @@ async fn write_manifest_file_to_path( #[cfg(test)] mod tests { use super::*; - use crate::index::IndexType; use crate::index::vector::MetricType; - use crate::index::{DatasetIndexExt, vector::VectorIndexParams}; + use crate::index::IndexType; + use crate::index::{vector::VectorIndexParams, DatasetIndexExt}; use crate::{datatypes::Schema, utils::testing::generate_random_array}; use crate::dataset::WriteMode::Overwrite; diff --git a/rust/src/dataset/scanner.rs b/rust/src/dataset/scanner.rs index 4edaf6bd5b9..c08b7b1e345 100644 --- a/rust/src/dataset/scanner.rs +++ b/rust/src/dataset/scanner.rs @@ -590,7 +590,10 @@ mod test { use super::*; use crate::arrow::*; - use crate::index::{DatasetIndexExt, {vector::VectorIndexParams, IndexType}}; + use crate::index::{ + DatasetIndexExt, + {vector::VectorIndexParams, IndexType}, + }; use crate::{arrow::RecordBatchBuffer, dataset::WriteParams}; #[tokio::test] diff --git a/rust/src/index/vector.rs b/rust/src/index/vector.rs index a0ef669bd0d..4f4c38877e4 100644 --- a/rust/src/index/vector.rs +++ b/rust/src/index/vector.rs @@ -43,6 +43,7 @@ use crate::{ index::{ pb::vector_index_stage::Stage, vector::{ + diskann::DiskANNParams, ivf::Ivf, opq::{OPQIndex, OptimizedProductQuantizer}, pq::ProductQuantizer, @@ -60,10 +61,6 @@ use crate::{ }; pub use traits::*; -const MAX_ITERATIONS: usize = 50; -/// Maximum number of iterations for OPQ. -/// See OPQ paper for details. -const MAX_OPQ_ITERATIONS: usize = 100; pub(crate) const SCORE_COL: &str = "score"; const INDEX_FILE_NAME: &str = "index.idx"; @@ -172,9 +169,6 @@ pub struct VectorIndexParams { /// Vector distance metrics type. pub metric_type: MetricType, - - /// Max number of iterations to train a KMean model - pub max_iterations: usize, } impl VectorIndexParams { @@ -195,8 +189,6 @@ impl VectorIndexParams { max_iterations: usize, ) -> Self { let mut stages: Vec> = vec![]; - if use_opq {}; - stages.push(Box::new(IvfBuildParams::new(num_partitions))); let mut pq_params = PQBuildParams::default(); pq_params.num_bits = num_bits as usize; @@ -210,13 +202,16 @@ impl VectorIndexParams { Self { stages, metric_type, - max_iterations, } } /// Create index parameters for `DiskANN` index. - pub fn diskann() -> Self { - todo!("DiskANN is not supported yet") + pub fn diskann(r: usize, alpha: f32, l: usize, metric_type: MetricType) -> Self { + let stage = Box::new(DiskANNParams::new(r, alpha, l)); + Self { + stages: vec![stage], + metric_type, + } } } diff --git a/rust/src/index/vector/diskann.rs b/rust/src/index/vector/diskann.rs index 4969d0a5a08..369e366c428 100644 --- a/rust/src/index/vector/diskann.rs +++ b/rust/src/index/vector/diskann.rs @@ -24,11 +24,12 @@ use byteorder::{ByteOrder, LE}; use super::{ graph::{Vertex, VertexSerDe}, - MetricType, + MetricType, VertexIndexStageParams, }; use crate::index::vector::pq::PQBuildParams; use crate::Result; +#[derive(Clone, Debug)] pub struct DiskANNParams { /// out-degree bound (R) pub r: usize, @@ -107,6 +108,12 @@ impl DiskANNParams { } } +impl VertexIndexStageParams for DiskANNParams { + fn as_any(&self) -> &dyn std::any::Any { + self + } +} + /// DiskANN Vertex with PQ as persisted data. #[derive(Clone, Debug)] pub(crate) struct PQVertex { From 33496387dc3202e069de6c1aa3db7b441172285a Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 12:04:19 -0700 Subject: [PATCH 03/16] compilable --- rust/src/index.rs | 9 +- rust/src/index/vector.rs | 18 ++- rust/src/index/vector/diskann.rs | 1 + rust/src/index/vector/diskann/builder.rs | 141 +++++++++-------------- rust/src/index/vector/graph/builder.rs | 2 +- rust/src/index/vector/ivf.rs | 10 +- 6 files changed, 83 insertions(+), 98 deletions(-) diff --git a/rust/src/index.rs b/rust/src/index.rs index abb96e29e76..601248114a5 100644 --- a/rust/src/index.rs +++ b/rust/src/index.rs @@ -133,7 +133,14 @@ impl DatasetIndexExt for Dataset { Error::Index("Vector index type must take a VectorIndexParams".to_string()) })?; - build_vector_index(self, column, &index_name, &index_id, &vec_params).await?; + build_vector_index( + self, + column, + &index_name, + &index_id.to_string(), + &vec_params, + ) + .await?; } } diff --git a/rust/src/index/vector.rs b/rust/src/index/vector.rs index 4f4c38877e4..3ac0f7ec592 100644 --- a/rust/src/index/vector.rs +++ b/rust/src/index/vector.rs @@ -101,14 +101,15 @@ impl MetricType { /// Compute the distance from one vector to a batch of vectors. pub fn batch_func( &self, - ) -> Arc Arc + Send + Sync> { + ) -> Arc Arc + Send + Sync + 'static> { match self { Self::L2 => Arc::new(l2_distance_batch), Self::Cosine => Arc::new(cosine_distance_batch), } } + /// Returns the distance function between two vectors. - pub fn func(&self) -> Arc f32> { + pub fn func(&self) -> Arc f32 + Send + Sync + 'static> { match self { Self::L2 => Arc::new(l2_distance), Self::Cosine => Arc::new(cosine_distance), @@ -234,7 +235,7 @@ fn is_diskann(stages: &[Box]) -> bool { if stages.is_empty() { return false; } - matches!(stages.last().unwrap().as_ref(), DiskANNBuildParams) + matches!(stages.last().unwrap().as_ref(), DiskANNParams) } /// Build a Vector Index @@ -242,7 +243,7 @@ pub(crate) async fn build_vector_index( dataset: &Dataset, column: &str, name: &str, - uuid: &uuid::Uuid, + uuid: &str, params: &VectorIndexParams, ) -> Result<()> { let stages = ¶ms.stages; @@ -279,6 +280,15 @@ pub(crate) async fn build_vector_index( .await? } else if is_diskann(stages) { // This is DiskANN index. + use self::diskann::build_diskann_index; + let params = stages + .last() + .unwrap() + .as_ref() + .as_any() + .downcast_ref::() + .unwrap(); + build_diskann_index(dataset, column, uuid, params.clone()).await?; } else { return Err(Error::Index(format!( "Build Vector Index: invalid stages: {:?}", diff --git a/rust/src/index/vector/diskann.rs b/rust/src/index/vector/diskann.rs index 369e366c428..fac01225fe4 100644 --- a/rust/src/index/vector/diskann.rs +++ b/rust/src/index/vector/diskann.rs @@ -28,6 +28,7 @@ use super::{ }; use crate::index::vector::pq::PQBuildParams; use crate::Result; +pub(crate) use builder::build_diskann_index; #[derive(Clone, Debug)] pub struct DiskANNParams { diff --git a/rust/src/index/vector/diskann/builder.rs b/rust/src/index/vector/diskann/builder.rs index 094158497f5..4b40cbcf046 100644 --- a/rust/src/index/vector/diskann/builder.rs +++ b/rust/src/index/vector/diskann/builder.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::collections::{BinaryHeap, HashSet}; -use std::sync::Arc; use arrow_array::{cast::as_primitive_array, types::UInt64Type}; use arrow_select::concat::concat_batches; @@ -21,7 +20,7 @@ use futures::stream::{self, StreamExt, TryStreamExt}; use ordered_float::OrderedFloat; use rand::distributions::Uniform; use rand::prelude::SliceRandom; -use rand::Rng; +use rand::{Rng, SeedableRng}; use crate::arrow::{linalg::MatrixView, *}; use crate::dataset::{Dataset, ROW_ID}; @@ -38,89 +37,59 @@ use crate::{Error, Result}; use super::row_vertex::RowVertex; use super::search::greedy_search; -/// Builder for DiskANN index. -pub struct Builder { - dataset: Arc, - - column: String, - - uuid: String, - +pub(crate) async fn build_diskann_index( + dataset: &Dataset, + column: &str, + uuid: &str, params: DiskANNParams, -} - -impl Builder { - /// Create a [`Builder`] to build DiskANN index. - pub fn new(dataset: Arc, column: &str, uuid: &str, params: DiskANNParams) -> Self { - Self { - dataset, - column: column.to_string(), - uuid: uuid.to_string(), - params, - } - } - - /// Build the index. - pub async fn build(&self) -> Result<()> { - // Randomly initialize the graph with r random neighbors for each vertex. - let mut graph = init_graph( - self.dataset.clone(), - &self.column, - self.params.r, - MetricType::L2, - rand::thread_rng(), - ) - .await?; - - // Find medoid - let medoid = find_medoid(&graph.data, self.params.metric_type).await?; - - let rng = rand::thread_rng(); - // First pass. - let now = std::time::Instant::now(); - index_once( - &mut graph, - medoid, - 1.0, - self.params.r, - self.params.l, - rng.clone(), - ) - .await?; - println!("DiskANN: first pass: {}s", now.elapsed().as_secs_f32()); - // Second pass. - let now = std::time::Instant::now(); - index_once( - &mut graph, - medoid, - self.params.alpha, - self.params.r, - self.params.l, - rng.clone(), - ) - .await?; - println!("DiskANN: second pass: {}s", now.elapsed().as_secs_f32()); - - let index_dir = self.dataset.indices_dir().child(self.uuid.as_str()); - let graph_file = index_dir.child("diskann_graph.lance"); - - let mut write_params = WriteGraphParams::default(); - write_params.batch_size = 2048 * 10; - let serde = RowVertexSerDe {}; - - write_graph( - &graph, - self.dataset.object_store(), - &graph_file, - &write_params, - &serde, - ) - .await?; - - // Write metadata +) -> Result<()> { + let rng = rand::rngs::SmallRng::from_entropy(); + + // Randomly initialize the graph with r random neighbors for each vertex. + let mut graph = init_graph(dataset, column, params.r, params.metric_type, rng.clone()).await?; + + // Find medoid + let medoid = { + let vectors = graph.data.clone(); + find_medoid(&vectors, params.metric_type).await? + }; + + // First pass. + let now = std::time::Instant::now(); + index_once(&mut graph, medoid, 1.0, params.r, params.l, rng.clone()).await?; + println!("DiskANN: first pass: {}s", now.elapsed().as_secs_f32()); + // Second pass. + let now = std::time::Instant::now(); + index_once( + &mut graph, + medoid, + params.alpha, + params.r, + params.l, + rng.clone(), + ) + .await?; + println!("DiskANN: second pass: {}s", now.elapsed().as_secs_f32()); + + let index_dir = dataset.indices_dir().child(uuid); + let graph_file = index_dir.child("diskann_graph.lance"); + + let mut write_params = WriteGraphParams::default(); + write_params.batch_size = 2048 * 10; + let serde = RowVertexSerDe {}; + + write_graph( + &graph, + dataset.object_store(), + &graph_file, + &write_params, + &serde, + ) + .await?; + + // Write metadata - Ok(()) - } + Ok(()) } /// Randomly initialize the graph with r random neighbors for each vertex. @@ -133,7 +102,7 @@ impl Builder { /// - rng: the random number generator. /// async fn init_graph( - dataset: Arc, + dataset: &Dataset, column: &str, r: usize, metric_type: MetricType, @@ -346,6 +315,8 @@ async fn index_once( mod tests { use super::*; + use std::sync::Arc; + use arrow_array::{FixedSizeListArray, RecordBatch, RecordBatchReader}; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; use tempfile; @@ -390,7 +361,7 @@ mod tests { let dataset = create_dataset(uri, 200, 64).await; let rng = rand::thread_rng(); - let graph = init_graph(dataset, "vector", 10, MetricType::L2, rng) + let graph = init_graph(dataset.as_ref(), "vector", 10, MetricType::L2, rng) .await .unwrap(); diff --git a/rust/src/index/vector/graph/builder.rs b/rust/src/index/vector/graph/builder.rs index 75e74ebd336..9f25cd7a388 100644 --- a/rust/src/index/vector/graph/builder.rs +++ b/rust/src/index/vector/graph/builder.rs @@ -45,7 +45,7 @@ pub(crate) struct GraphBuilder { metric_type: MetricType, /// Distance function. - distance_func: Arc f32>, + distance_func: Arc f32 + Send + Sync>, } impl<'a, V: Vertex + Clone> GraphBuilder { diff --git a/rust/src/index/vector/ivf.rs b/rust/src/index/vector/ivf.rs index 7caa7769029..a9057c4d338 100644 --- a/rust/src/index/vector/ivf.rs +++ b/rust/src/index/vector/ivf.rs @@ -33,7 +33,6 @@ use futures::{ TryStreamExt, }; use rand::{rngs::SmallRng, SeedableRng}; -use uuid::Uuid; use super::{ opq::train_opq, @@ -494,7 +493,7 @@ pub async fn build_ivf_pq_index( dataset: &Dataset, column: &str, index_name: &str, - uuid: &Uuid, + uuid: &str, metric_type: MetricType, ivf_params: &IvfBuildParams, pq_params: &PQBuildParams, @@ -631,7 +630,7 @@ async fn write_index_file( dataset: &Dataset, column: &str, index_name: &str, - uuid: &Uuid, + uuid: &str, transformers: &[Box], mut ivf: Ivf, pq: ProductQuantizer, @@ -639,10 +638,7 @@ async fn write_index_file( batches: &[RecordBatch], ) -> Result<()> { let object_store = dataset.object_store(); - let path = dataset - .indices_dir() - .child(uuid.to_string()) - .child(INDEX_FILE_NAME); + let path = dataset.indices_dir().child(uuid).child(INDEX_FILE_NAME); let mut writer = object_store.create(&path).await?; // Write each partition to disk. From 227265adbf5793c0a746a28717d21978e0151b94 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 12:19:58 -0700 Subject: [PATCH 04/16] python buildable --- python/src/dataset.rs | 65 ++++++++++++++++++++++++++-------------- rust/src/index.rs | 2 +- rust/src/index/vector.rs | 11 ++++++- 3 files changed, 54 insertions(+), 24 deletions(-) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index ec345096ad5..3aae124fea9 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -20,6 +20,8 @@ use arrow::pyarrow::*; use arrow_array::{Float32Array, RecordBatchReader}; use arrow_data::ArrayData; use arrow_schema::Schema as ArrowSchema; +use lance::index::vector::ivf::IvfBuildParams; +use lance::index::vector::pq::PQBuildParams; use pyo3::exceptions::{PyIOError, PyKeyError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{IntoPyDict, PyBool, PyDict, PyInt, PyLong}; @@ -33,7 +35,7 @@ use lance::dataset::{ }; use lance::index::{ vector::{MetricType, VectorIndexParams}, - IndexType, + IndexType, DatasetIndexExt, }; const DEFAULT_NPROBS: usize = 1; @@ -275,7 +277,7 @@ impl Dataset { kwargs: Option<&PyDict>, ) -> PyResult<()> { let idx_type = match index_type.to_uppercase().as_str() { - "IVF_PQ" => IndexType::Vector, + "IVF_PQ" | "DISKANN" => IndexType::Vector, _ => { return Err(PyValueError::new_err(format!( "Index type '{index_type}' is not supported." @@ -283,31 +285,50 @@ impl Dataset { } }; - // Only VectorParams are supported. - let mut params = VectorIndexParams::default(); - if let Some(kwargs) = kwargs { - if let Some(n) = kwargs.get_item("num_partitions") { - params.num_partitions = PyAny::downcast::(n)?.extract()? - }; - - if let Some(n) = kwargs.get_item("num_sub_vectors") { - params.num_sub_vectors = PyAny::downcast::(n)?.extract()? - }; - - if let Some(o) = kwargs.get_item("use_opq") { - params.use_opq = PyAny::downcast::(o)?.extract()? - }; - if let Some(o) = kwargs.get_item("max_opq_iterations") { - params.max_opq_iterations = PyAny::downcast::(o)?.extract()? - }; - } - - params.metric_type = match metric_type { + let m_type = match metric_type { Some(mt) => MetricType::try_from(mt.to_string().to_lowercase().as_str()) .map_err(|err| PyValueError::new_err(err.to_string()))?, None => MetricType::L2, }; + // Only VectorParams are supported. + let params = match index_type.to_uppercase().as_str() { + "IVF_PQ" => { + let mut ivf_params = IvfBuildParams::default(); + let mut pq_params = PQBuildParams::default(); + if let Some(kwargs) = kwargs { + if let Some(n) = kwargs.get_item("num_partitions") { + ivf_params.num_partitions = PyAny::downcast::(n)?.extract()? + }; + + if let Some(n) = kwargs.get_item("num_bits") { + pq_params.num_bits = PyAny::downcast::(n)?.extract()? + }; + + if let Some(n) = kwargs.get_item("num_sub_vectors") { + pq_params.num_sub_vectors = PyAny::downcast::(n)?.extract()? + }; + + if let Some(o) = kwargs.get_item("use_opq") { + pq_params.use_opq = PyAny::downcast::(o)?.extract()? + }; + + if let Some(o) = kwargs.get_item("max_opq_iterations") { + pq_params.max_opq_iters = PyAny::downcast::(o)?.extract()? + }; + } + VectorIndexParams::with_ivf_pq_params(m_type, ivf_params, pq_params) + }, + "DISKANN" => { + todo!() + }, + _ => { + return Err(PyValueError::new_err(format!( + "Index type '{index_type}' is not supported." + ))) + } + }; + self_ .rt .block_on(async { diff --git a/rust/src/index.rs b/rust/src/index.rs index 601248114a5..290baf6990a 100644 --- a/rust/src/index.rs +++ b/rust/src/index.rs @@ -70,7 +70,7 @@ pub trait IndexParams: Send + Sync { /// Extends Dataset with secondary index. #[async_trait] -pub(crate) trait DatasetIndexExt { +pub trait DatasetIndexExt { /// Create indices on columns. /// /// Upon finish, a new dataset version is generated. diff --git a/rust/src/index/vector.rs b/rust/src/index/vector.rs index 3ac0f7ec592..b37da65789a 100644 --- a/rust/src/index/vector.rs +++ b/rust/src/index/vector.rs @@ -191,6 +191,7 @@ impl VectorIndexParams { ) -> Self { let mut stages: Vec> = vec![]; stages.push(Box::new(IvfBuildParams::new(num_partitions))); + let mut pq_params = PQBuildParams::default(); pq_params.num_bits = num_bits as usize; pq_params.num_sub_vectors = num_sub_vectors as usize; @@ -198,8 +199,16 @@ impl VectorIndexParams { pq_params.metric_type = metric_type; pq_params.max_iters = max_iterations; pq_params.max_opq_iters = max_iterations; - stages.push(Box::new(pq_params)); + + Self { + stages, + metric_type, + } + } + + pub fn with_ivf_pq_params(metric_type: MetricType, ivf: IvfBuildParams, pq: PQBuildParams) -> Self { + let stages: Vec> = vec![Box::new(ivf), Box::new(pq)]; Self { stages, metric_type, From f1fb164fc86520575da560f4d5292208edfb576d Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 12:29:34 -0700 Subject: [PATCH 05/16] cargo fmt --- python/python/lance/dataset.py | 11 ++++++----- python/python/tests/test_vector_index.py | 11 ----------- python/src/dataset.rs | 5 +++-- rust/src/index/vector.rs | 16 +++++++++------- 4 files changed, 18 insertions(+), 25 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index f960d6f8f8b..14bdf8edd7b 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -402,14 +402,15 @@ def create_index( ]: raise ValueError(f"Metric {metric} not supported.") index_type = index_type.upper() - if index_type != "IVF_PQ": + if index_type not in ["IVF_PQ", "DISKANN"]: raise NotImplementedError( f"Only IVF_PQ index_type supported. Got {index_type}" ) - if "num_partitions" not in kwargs or "num_sub_vectors" not in kwargs: - raise ValueError( - "num_partitions and num_sub_vectors are required for IVF_PQ" - ) + if index_type == "IVF_PQ": + if "num_partitions" not in kwargs or "num_sub_vectors" not in kwargs: + raise ValueError( + "num_partitions and num_sub_vectors are required for IVF_PQ" + ) self._ds.create_index(column, index_type, name, metric, kwargs) return LanceDataset(self.uri) diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py index 64dae668df8..f75d6057988 100644 --- a/python/python/tests/test_vector_index.py +++ b/python/python/tests/test_vector_index.py @@ -10,17 +10,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import os import platform diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 3aae124fea9..e565ff9237c 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -35,7 +35,7 @@ use lance::dataset::{ }; use lance::index::{ vector::{MetricType, VectorIndexParams}, - IndexType, DatasetIndexExt, + IndexType, DatasetIndexExt, vector::diskann::DiskANNParams, }; const DEFAULT_NPROBS: usize = 1; @@ -320,7 +320,8 @@ impl Dataset { VectorIndexParams::with_ivf_pq_params(m_type, ivf_params, pq_params) }, "DISKANN" => { - todo!() + let params = DiskANNParams::default(); + VectorIndexParams::with_diskann_params(m_type, params) }, _ => { return Err(PyValueError::new_err(format!( diff --git a/rust/src/index/vector.rs b/rust/src/index/vector.rs index b37da65789a..91929a52914 100644 --- a/rust/src/index/vector.rs +++ b/rust/src/index/vector.rs @@ -20,8 +20,7 @@ use std::sync::Arc; use arrow_array::Float32Array; -#[allow(dead_code)] -mod diskann; +pub mod diskann; pub mod flat; #[allow(dead_code)] mod graph; @@ -207,7 +206,11 @@ impl VectorIndexParams { } } - pub fn with_ivf_pq_params(metric_type: MetricType, ivf: IvfBuildParams, pq: PQBuildParams) -> Self { + pub fn with_ivf_pq_params( + metric_type: MetricType, + ivf: IvfBuildParams, + pq: PQBuildParams, + ) -> Self { let stages: Vec> = vec![Box::new(ivf), Box::new(pq)]; Self { stages, @@ -215,11 +218,10 @@ impl VectorIndexParams { } } - /// Create index parameters for `DiskANN` index. - pub fn diskann(r: usize, alpha: f32, l: usize, metric_type: MetricType) -> Self { - let stage = Box::new(DiskANNParams::new(r, alpha, l)); + pub fn with_diskann_params(metric_type: MetricType, diskann: DiskANNParams) -> Self { + let stages: Vec> = vec![Box::new(diskann)]; Self { - stages: vec![stage], + stages, metric_type, } } From 7f202791545120316d6e40f7ad671226b56450dc Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 13:03:52 -0700 Subject: [PATCH 06/16] expose diskann parameters via python dataset api --- python/python/lance/dataset.py | 8 +++++--- python/src/dataset.rs | 24 +++++++++++++++++++----- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 14bdf8edd7b..5ed7901b1a4 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -344,9 +344,11 @@ def create_index( - **use_opq**: whether to use OPQ (Optimized Product Quantization). - **max_opq_iterations**: the maximum number of iterations for training OPQ. - For SIMD, the vector dimensions / num_sub_vectors must be a multiple of the stride - depending on the platform (4, 8, 16). An error is raised if this alignment - is not met. + If `index_type` is "DISKANN", then the following parameters are optional: + + - **r**: out-degree bound + - **l**: number of levels in the graph. + - **alpha**: distance threadhold for the graph. Examples -------- diff --git a/python/src/dataset.rs b/python/src/dataset.rs index e565ff9237c..64a8d7126bf 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -24,7 +24,7 @@ use lance::index::vector::ivf::IvfBuildParams; use lance::index::vector::pq::PQBuildParams; use pyo3::exceptions::{PyIOError, PyKeyError, PyValueError}; use pyo3::prelude::*; -use pyo3::types::{IntoPyDict, PyBool, PyDict, PyInt, PyLong}; +use pyo3::types::{IntoPyDict, PyBool, PyDict, PyFloat, PyInt, PyLong}; use pyo3::{pyclass, PyObject, PyResult}; use tokio::runtime::Runtime; @@ -34,8 +34,9 @@ use lance::dataset::{ scanner::Scanner as LanceScanner, Dataset as LanceDataset, Version, WriteMode, WriteParams, }; use lance::index::{ + vector::diskann::DiskANNParams, vector::{MetricType, VectorIndexParams}, - IndexType, DatasetIndexExt, vector::diskann::DiskANNParams, + DatasetIndexExt, IndexType, }; const DEFAULT_NPROBS: usize = 1; @@ -318,11 +319,24 @@ impl Dataset { }; } VectorIndexParams::with_ivf_pq_params(m_type, ivf_params, pq_params) - }, + } "DISKANN" => { - let params = DiskANNParams::default(); + let mut params = DiskANNParams::default(); + if let Some(kwargs) = kwargs { + if let Some(n) = kwargs.get_item("r") { + params.r = PyAny::downcast::(n)?.extract()? + }; + + if let Some(n) = kwargs.get_item("alpha") { + params.alpha = PyAny::downcast::(n)?.extract()? + }; + + if let Some(n) = kwargs.get_item("l") { + params.l = PyAny::downcast::(n)?.extract()? + }; + } VectorIndexParams::with_diskann_params(m_type, params) - }, + } _ => { return Err(PyValueError::new_err(format!( "Index type '{index_type}' is not supported." From b9228aff796015c71e86e5f0ac7cfff9bde12888 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 16:34:01 -0700 Subject: [PATCH 07/16] write metadata --- rust/src/index.rs | 25 ++++++------ rust/src/index/vector/diskann.rs | 1 + rust/src/index/vector/diskann/builder.rs | 49 +++++++++++++++++++++++- 3 files changed, 60 insertions(+), 15 deletions(-) diff --git a/rust/src/index.rs b/rust/src/index.rs index 290baf6990a..f19a0760bfa 100644 --- a/rust/src/index.rs +++ b/rust/src/index.rs @@ -1,19 +1,16 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at +// Copyright 2023 Lance Developers. // -// http://www.apache.org/licenses/LICENSE-2.0 +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at // -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. //! Secondary Index //! diff --git a/rust/src/index/vector/diskann.rs b/rust/src/index/vector/diskann.rs index fac01225fe4..75d41ff8f95 100644 --- a/rust/src/index/vector/diskann.rs +++ b/rust/src/index/vector/diskann.rs @@ -26,6 +26,7 @@ use super::{ graph::{Vertex, VertexSerDe}, MetricType, VertexIndexStageParams, }; +use crate::index::pb; use crate::index::vector::pq::PQBuildParams; use crate::Result; pub(crate) use builder::build_diskann_index; diff --git a/rust/src/index/vector/diskann/builder.rs b/rust/src/index/vector/diskann/builder.rs index 4b40cbcf046..81fda6fc70b 100644 --- a/rust/src/index/vector/diskann/builder.rs +++ b/rust/src/index/vector/diskann/builder.rs @@ -24,13 +24,14 @@ use rand::{Rng, SeedableRng}; use crate::arrow::{linalg::MatrixView, *}; use crate::dataset::{Dataset, ROW_ID}; +use crate::index::pb; use crate::index::vector::diskann::row_vertex::RowVertexSerDe; use crate::index::vector::diskann::DiskANNParams; use crate::index::vector::graph::{ builder::GraphBuilder, write_graph, VertexWithDistance, WriteGraphParams, }; use crate::index::vector::graph::{Graph, Vertex}; -use crate::index::vector::MetricType; +use crate::index::vector::{MetricType, INDEX_FILE_NAME}; use crate::linalg::l2::l2_distance; use crate::{Error, Result}; @@ -311,6 +312,52 @@ async fn index_once( Ok(()) } +async fn write_index_file( + dataset: &Dataset, + column: &str, + index_name: &str, + uuid: &str, + dimension: usize, + graph_file: &str, + metric_type: MetricType, + params: &DiskANNParams, +) -> Result<()> { + let object_store = dataset.object_store(); + let path = dataset.indices_dir().child(uuid).child(INDEX_FILE_NAME); + let mut writer = object_store.create(&path).await?; + + let stages: Vec = vec![pb::VectorIndexStage { + stage: Some(pb::vector_index_stage::Stage::Diskann(pb::DiskAnn { + spec: 1, + filename: graph_file.to_string(), + r: params.r as u32, + alpha: params.alpha, + l: params.l as u32, + })), + }]; + let metadata = pb::Index { + name: index_name.to_string(), + columns: vec![column.to_string()], + dataset_version: dataset.version().version, + index_type: pb::IndexType::Vector.into(), + implementation: Some(pb::index::Implementation::VectorIndex(pb::VectorIndex { + spec_version: 1, + dimension: dimension as u32, + stages, + metric_type: match metric_type { + MetricType::L2 => pb::VectorMetricType::L2.into(), + MetricType::Cosine => pb::VectorMetricType::Cosine.into(), + }, + })), + }; + + let pos = writer.write_protobuf(&metadata).await?; + writer.write_magics(pos).await?; + writer.shutdown().await?; + + Ok(()) +} + #[cfg(test)] mod tests { use super::*; From fe0d34bb07a98b8e643fcfc22bbcb92906e52fbb Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 16:38:30 -0700 Subject: [PATCH 08/16] write metadadta file --- rust/src/index/vector/diskann/builder.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/rust/src/index/vector/diskann/builder.rs b/rust/src/index/vector/diskann/builder.rs index 81fda6fc70b..880af096a56 100644 --- a/rust/src/index/vector/diskann/builder.rs +++ b/rust/src/index/vector/diskann/builder.rs @@ -41,6 +41,7 @@ use super::search::greedy_search; pub(crate) async fn build_diskann_index( dataset: &Dataset, column: &str, + name: &str, uuid: &str, params: DiskANNParams, ) -> Result<()> { @@ -88,7 +89,17 @@ pub(crate) async fn build_diskann_index( ) .await?; - // Write metadata + write_index_file( + dataset, + column, + name, + uuid, + graph.data.num_columns(), + graph_file.to_string().as_str(), + params.metric_type, + ¶ms, + ) + .await?; Ok(()) } From 644f759d7ec89b9c731f3942d95a233944f5542e Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 16:48:02 -0700 Subject: [PATCH 09/16] add index --- rust/src/index/vector.rs | 2 +- rust/src/index/vector/diskann.rs | 63 +++++++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/rust/src/index/vector.rs b/rust/src/index/vector.rs index 91929a52914..7cd54db93b5 100644 --- a/rust/src/index/vector.rs +++ b/rust/src/index/vector.rs @@ -299,7 +299,7 @@ pub(crate) async fn build_vector_index( .as_any() .downcast_ref::() .unwrap(); - build_diskann_index(dataset, column, uuid, params.clone()).await?; + build_diskann_index(dataset, column, name, uuid, params.clone()).await?; } else { return Err(Error::Index(format!( "Build Vector Index: invalid stages: {:?}", diff --git a/rust/src/index/vector/diskann.rs b/rust/src/index/vector/diskann.rs index 75d41ff8f95..f58ccb79074 100644 --- a/rust/src/index/vector/diskann.rs +++ b/rust/src/index/vector/diskann.rs @@ -26,7 +26,6 @@ use super::{ graph::{Vertex, VertexSerDe}, MetricType, VertexIndexStageParams, }; -use crate::index::pb; use crate::index::vector::pq::PQBuildParams; use crate::Result; pub(crate) use builder::build_diskann_index; @@ -163,3 +162,65 @@ impl VertexSerDe for PQVertexSerDe { bytes } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_array::{FixedSizeListArray, RecordBatch, RecordBatchReader}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use tempfile::tempdir; + + use super::*; + use crate::{ + arrow::*, + dataset::Dataset, + index::{ + DatasetIndexExt, + {vector::VectorIndexParams, IndexType}, + }, + utils::testing::generate_random_array, + }; + + #[tokio::test] + async fn test_create_index() { + let test_dir = tempdir().unwrap(); + + let dimension = 16; + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + "embeddings", + DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Float32, true)), + dimension, + ), + false, + )])); + + let float_arr = generate_random_array(512 * dimension as usize); + let vectors = Arc::new(FixedSizeListArray::try_new(float_arr, dimension).unwrap()); + let batches = RecordBatchBuffer::new(vec![RecordBatch::try_new( + schema.clone(), + vec![vectors.clone()], + ) + .unwrap()]); + + let test_uri = test_dir.path().to_str().unwrap(); + + let mut reader: Box = Box::new(batches); + let dataset = Dataset::write(&mut reader, test_uri, None).await.unwrap(); + + // Make sure valid arguments should create index successfully + let params = + VectorIndexParams::with_diskann_params(MetricType::L2, DiskANNParams::default()); + let dataset = dataset + .create_index(&["embeddings"], IndexType::Vector, None, ¶ms) + .await + .unwrap(); + + // Check the version is set correctly + let indices = dataset.load_indices().await.unwrap(); + let actual = indices.first().unwrap().dataset_version; + let expected = dataset.manifest.version; + assert_eq!(actual, expected); + } +} From 32170756aad61aa27138c1efc81ecdbd61df17f3 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 17:54:38 -0700 Subject: [PATCH 10/16] fix an assert --- rust/src/index/vector/diskann/search.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/src/index/vector/diskann/search.rs b/rust/src/index/vector/diskann/search.rs index 300c7387ebd..1d6284dc9c3 100644 --- a/rust/src/index/vector/diskann/search.rs +++ b/rust/src/index/vector/diskann/search.rs @@ -59,9 +59,9 @@ impl SearchState { /// Return the next unvisited vertex. fn pop(&mut self) -> Option { while let Some(vertex) = self.heap.pop() { - debug_assert!(!self.visited.contains(&vertex.0.id)); + // println!("Pop {} visited {:?}", vertex.0.id, self.visited); - if !self.candidates.contains_key(&vertex.0.distance) { + if self.is_visited(vertex.0.id) || !self.candidates.contains_key(&vertex.0.distance) { // The vertex has been removed from the candidate lists, // from [`push()`]. continue; From c4c7dd6e20c3f739ee9555c1cf11ea67bfeec0a4 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 20:05:36 -0700 Subject: [PATCH 11/16] serialize entry points --- protos/index.proto | 3 +++ rust/src/index/vector/diskann/builder.rs | 3 +++ 2 files changed, 6 insertions(+) diff --git a/protos/index.proto b/protos/index.proto index f6e1cfa5b6e..e32ea477982 100644 --- a/protos/index.proto +++ b/protos/index.proto @@ -108,6 +108,9 @@ message DiskAnn { // L parameter uint32 L = 5; + + /// Entry points to the graph + repeated uint64 entries = 6; } // One stage in the vector index pipeline. diff --git a/rust/src/index/vector/diskann/builder.rs b/rust/src/index/vector/diskann/builder.rs index 880af096a56..3bb8b68e1b4 100644 --- a/rust/src/index/vector/diskann/builder.rs +++ b/rust/src/index/vector/diskann/builder.rs @@ -96,6 +96,7 @@ pub(crate) async fn build_diskann_index( uuid, graph.data.num_columns(), graph_file.to_string().as_str(), + &[medoid], params.metric_type, ¶ms, ) @@ -330,6 +331,7 @@ async fn write_index_file( uuid: &str, dimension: usize, graph_file: &str, + entries: &[usize], metric_type: MetricType, params: &DiskANNParams, ) -> Result<()> { @@ -344,6 +346,7 @@ async fn write_index_file( r: params.r as u32, alpha: params.alpha, l: params.l as u32, + entries: entries.iter().map(|v| *v as u64).collect(), })), }]; let metadata = pb::Index { From 4c945fad17bb0ff975f48afde361cd199d151453 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 23:01:18 -0700 Subject: [PATCH 12/16] make lq buildable --- rust/src/bin/lq.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rust/src/bin/lq.rs b/rust/src/bin/lq.rs index ad41889ba03..f0ea12b491d 100644 --- a/rust/src/bin/lq.rs +++ b/rust/src/bin/lq.rs @@ -19,7 +19,7 @@ use futures::stream::StreamExt; use futures::TryStreamExt; use lance::dataset::Dataset; -use lance::index::vector::{MetricType, VectorIndexParams}; +use lance::index::{vector::{MetricType, VectorIndexParams}, DatasetIndexExt}; use lance::{Error, Result}; #[derive(Parser)] @@ -69,11 +69,11 @@ enum Commands { /// Nunber of IVF partitions. Only useful when the index type is 'ivf-pq'. #[arg(short = 'p', long, default_value_t = 64, value_name = "NUM")] - num_partitions: u32, + num_partitions: usize, /// Number of sub-vectors in Product Quantizer #[arg(short = 's', long, default_value_t = 8, value_name = "NUM")] - num_sub_vectors: u32, + num_sub_vectors: usize, /// Distance metric type. Only support 'l2' and 'cosine'. #[arg(short = 'm', long, value_name = "DISTANCE")] @@ -160,8 +160,8 @@ async fn create_index( name: &Option, column: &Option, index_type: &Option, - num_partitions: &u32, - num_sub_vectors: &u32, + num_partitions: &usize, + num_sub_vectors: &usize, metric_type: &Option, use_opq: bool, ) -> Result<()> { From 4bd25ddfdcaf74b13c5768bb6782c0c8077d222a Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 19 Apr 2023 23:04:15 -0700 Subject: [PATCH 13/16] cargo fmt --- rust/src/bin/lq.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rust/src/bin/lq.rs b/rust/src/bin/lq.rs index f0ea12b491d..9d4becb532b 100644 --- a/rust/src/bin/lq.rs +++ b/rust/src/bin/lq.rs @@ -19,7 +19,10 @@ use futures::stream::StreamExt; use futures::TryStreamExt; use lance::dataset::Dataset; -use lance::index::{vector::{MetricType, VectorIndexParams}, DatasetIndexExt}; +use lance::index::{ + vector::{MetricType, VectorIndexParams}, + DatasetIndexExt, +}; use lance::{Error, Result}; #[derive(Parser)] From 08d061ab129f5dcfef578b196dd194c282c3323e Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sat, 22 Apr 2023 21:57:26 -0700 Subject: [PATCH 14/16] make it ready for review --- rust/src/index/vector.rs | 67 ++++++++++++--------- rust/src/index/vector/diskann.rs | 52 ---------------- rust/src/index/vector/diskann/builder.rs | 3 +- rust/src/index/vector/diskann/row_vertex.rs | 1 + rust/src/index/vector/diskann/search.rs | 2 + rust/src/linalg/normalize.rs | 1 - 6 files changed, 41 insertions(+), 85 deletions(-) diff --git a/rust/src/index/vector.rs b/rust/src/index/vector.rs index 7cd54db93b5..89c10dd1b67 100644 --- a/rust/src/index/vector.rs +++ b/rust/src/index/vector.rs @@ -159,13 +159,21 @@ impl TryFrom<&str> for MetricType { } } +/// Parameters of each stage. +#[derive(Debug)] +pub enum StageParams { + Ivf(IvfBuildParams), + PQ(PQBuildParams), + DiskANN(DiskANNParams), +} + pub trait VertexIndexStageParams: std::fmt::Debug + Send + Sync { fn as_any(&self) -> &dyn Any; } /// The parameters to build vector index. pub struct VectorIndexParams { - pub stages: Vec>, + pub stages: Vec, /// Vector distance metrics type. pub metric_type: MetricType, @@ -188,8 +196,8 @@ impl VectorIndexParams { metric_type: MetricType, max_iterations: usize, ) -> Self { - let mut stages: Vec> = vec![]; - stages.push(Box::new(IvfBuildParams::new(num_partitions))); + let mut stages: Vec = vec![]; + stages.push(StageParams::Ivf(IvfBuildParams::new(num_partitions))); let mut pq_params = PQBuildParams::default(); pq_params.num_bits = num_bits as usize; @@ -198,7 +206,7 @@ impl VectorIndexParams { pq_params.metric_type = metric_type; pq_params.max_iters = max_iterations; pq_params.max_opq_iters = max_iterations; - stages.push(Box::new(pq_params)); + stages.push(StageParams::PQ(pq_params)); Self { stages, @@ -211,7 +219,7 @@ impl VectorIndexParams { ivf: IvfBuildParams, pq: PQBuildParams, ) -> Self { - let stages: Vec> = vec![Box::new(ivf), Box::new(pq)]; + let stages = vec![StageParams::Ivf(ivf), StageParams::PQ(pq)]; Self { stages, metric_type, @@ -219,7 +227,7 @@ impl VectorIndexParams { } pub fn with_diskann_params(metric_type: MetricType, diskann: DiskANNParams) -> Self { - let stages: Vec> = vec![Box::new(diskann)]; + let stages = vec![StageParams::DiskANN(diskann)]; Self { stages, metric_type, @@ -233,20 +241,22 @@ impl IndexParams for VectorIndexParams { } } -fn is_ivf_pq(stages: &[Box]) -> bool { +fn is_ivf_pq(stages: &[StageParams]) -> bool { if stages.len() < 2 { return false; } let len = stages.len(); - matches!(&stages[len - 1], PQBuildParams) && matches!(&stages[len - 2], IvfBuildParams) + matches!(&stages[len - 1], StageParams::PQ(_)) + && matches!(&stages[len - 2], StageParams::Ivf(_)) } -fn is_diskann(stages: &[Box]) -> bool { +fn is_diskann(stages: &[StageParams]) -> bool { if stages.is_empty() { return false; } - matches!(stages.last().unwrap().as_ref(), DiskANNParams) + let last = stages.last().unwrap(); + matches!(last, StageParams::DiskANN(_)) } /// Build a Vector Index @@ -268,37 +278,34 @@ pub(crate) async fn build_vector_index( if is_ivf_pq(stages) { // This is a IVF PQ index. let len = stages.len(); - let ivf_params = stages[len - 2] - .as_ref() - .as_any() - .downcast_ref::() - .unwrap(); - let pq_params = stages[len - 1] - .as_any() - .downcast_ref::() - .ok_or_else(|| { - Error::Index(format!("Build Vector Index: invalid stages: {:?}", stages)) - })?; + let StageParams::Ivf(ivf_params) = &stages[len - 2] else { + return Err(Error::Index( + format!("Build Vector Index: invalid stages: {:?}", stages), + )); + }; + let StageParams::PQ(pq_params) = &stages[len - 1] else { + return Err(Error::Index( + format!("Build Vector Index: invalid stages: {:?}", stages), + )); + }; build_ivf_pq_index( dataset, column, &name, &uuid, params.metric_type, - &ivf_params, - &pq_params, + ivf_params, + pq_params, ) .await? } else if is_diskann(stages) { // This is DiskANN index. use self::diskann::build_diskann_index; - let params = stages - .last() - .unwrap() - .as_ref() - .as_any() - .downcast_ref::() - .unwrap(); + let StageParams::DiskANN(params) = stages.last().unwrap() else { + return Err(Error::Index( + format!("Build Vector Index: invalid stages: {:?}", stages), + )); + }; build_diskann_index(dataset, column, name, uuid, params.clone()).await?; } else { return Err(Error::Index(format!( diff --git a/rust/src/index/vector/diskann.rs b/rust/src/index/vector/diskann.rs index f58ccb79074..bd217365777 100644 --- a/rust/src/index/vector/diskann.rs +++ b/rust/src/index/vector/diskann.rs @@ -19,15 +19,11 @@ mod builder; mod row_vertex; mod search; -use arrow_array::UInt8Array; -use byteorder::{ByteOrder, LE}; - use super::{ graph::{Vertex, VertexSerDe}, MetricType, VertexIndexStageParams, }; use crate::index::vector::pq::PQBuildParams; -use crate::Result; pub(crate) use builder::build_diskann_index; #[derive(Clone, Debug)] @@ -115,54 +111,6 @@ impl VertexIndexStageParams for DiskANNParams { } } -/// DiskANN Vertex with PQ as persisted data. -#[derive(Clone, Debug)] -pub(crate) struct PQVertex { - row_id: u64, - /// PQ code for the vector. - pq: UInt8Array, -} - -impl Vertex for PQVertex {} - -struct PQVertexSerDe { - num_sub_vectors: usize, - - /// Number of bits for each PQ code. - num_bits: usize, -} - -impl PQVertexSerDe { - pub(crate) fn new(num_sub_vectors: usize, num_bits: usize) -> Self { - Self { - num_sub_vectors, - num_bits, - } - } -} - -impl VertexSerDe for PQVertexSerDe { - #[inline] - fn size(&self) -> usize { - 8 /* row_id*/ + 8 /* Length of pq */ + self.num_sub_vectors * 1 /* only 8 bits now */ - } - - fn deserialize(&self, data: &[u8]) -> Result { - let row_id = LE::read_u64(&data[0..8]); - let pq_size = LE::read_u64(&data[8..16]) as usize; - let pq = UInt8Array::from_iter_values(data[16..16 + pq_size].iter().copied()); - Ok(PQVertex { row_id, pq }) - } - - fn serialize(&self, vertex: &PQVertex) -> Vec { - let mut bytes = Vec::with_capacity(self.size()); - bytes.extend_from_slice(&vertex.row_id.to_le_bytes()); - bytes.extend_from_slice(&vertex.pq.len().to_le_bytes()); - bytes.extend_from_slice(&vertex.pq.values()); - bytes - } -} - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/rust/src/index/vector/diskann/builder.rs b/rust/src/index/vector/diskann/builder.rs index 3bb8b68e1b4..a8fe147032f 100644 --- a/rust/src/index/vector/diskann/builder.rs +++ b/rust/src/index/vector/diskann/builder.rs @@ -146,8 +146,7 @@ async fn init_graph( let nodes = row_ids .values() .iter() - .enumerate() - .map(|(i, &row_id)| RowVertex::new(row_id, None)) + .map(|&row_id| RowVertex::new(row_id, None)) .collect::>(); let mut graph = GraphBuilder::new(&nodes, matrix, metric_type); diff --git a/rust/src/index/vector/diskann/row_vertex.rs b/rust/src/index/vector/diskann/row_vertex.rs index ce63a5a978d..c4306e93187 100644 --- a/rust/src/index/vector/diskann/row_vertex.rs +++ b/rust/src/index/vector/diskann/row_vertex.rs @@ -23,6 +23,7 @@ use crate::Result; pub(crate) struct RowVertex { pub(crate) row_id: u64, + #[allow(dead_code)] pub(crate) vector: Option, } diff --git a/rust/src/index/vector/diskann/search.rs b/rust/src/index/vector/diskann/search.rs index 1d6284dc9c3..4a971c4ab84 100644 --- a/rust/src/index/vector/diskann/search.rs +++ b/rust/src/index/vector/diskann/search.rs @@ -41,6 +41,8 @@ pub(crate) struct SearchState { l: usize, /// Number of results to return. + //TODO: used during search. + #[allow(dead_code)] k: usize, } diff --git a/rust/src/linalg/normalize.rs b/rust/src/linalg/normalize.rs index 7a5d2a58c07..585198af38a 100644 --- a/rust/src/linalg/normalize.rs +++ b/rust/src/linalg/normalize.rs @@ -69,7 +69,6 @@ mod aarch64 { let mut sum = unsafe { let buf = [0.0_f32; 4]; let mut sum = vld1q_f32(buf.as_ptr()); - let n = vector.len(); for i in (0..len).step_by(4) { let x = vld1q_f32(vector.as_ptr().add(i)); sum = vfmaq_f32(sum, x, x); From 9ef7bc2256b13cecfc8797dd307a627915dc0439 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sat, 22 Apr 2023 22:30:58 -0700 Subject: [PATCH 15/16] update message --- python/python/lance/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 5ed7901b1a4..34232348c20 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -406,7 +406,7 @@ def create_index( index_type = index_type.upper() if index_type not in ["IVF_PQ", "DISKANN"]: raise NotImplementedError( - f"Only IVF_PQ index_type supported. Got {index_type}" + f"Only IVF_PQ or DiskANN index_types supported. Got {index_type}" ) if index_type == "IVF_PQ": if "num_partitions" not in kwargs or "num_sub_vectors" not in kwargs: From 1786316b830ea204cafde6f29485d96c5f763607 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Sun, 23 Apr 2023 10:09:38 -0700 Subject: [PATCH 16/16] remove VertexIndexParams --- rust/src/index/vector.rs | 8 +++----- rust/src/index/vector/diskann.rs | 8 +------- rust/src/index/vector/ivf.rs | 8 +------- rust/src/index/vector/pq.rs | 8 +------- 4 files changed, 6 insertions(+), 26 deletions(-) diff --git a/rust/src/index/vector.rs b/rust/src/index/vector.rs index 89c10dd1b67..efd627fd4b6 100644 --- a/rust/src/index/vector.rs +++ b/rust/src/index/vector.rs @@ -159,16 +159,14 @@ impl TryFrom<&str> for MetricType { } } -/// Parameters of each stage. +/// Parameters of each index stage. #[derive(Debug)] pub enum StageParams { Ivf(IvfBuildParams), + PQ(PQBuildParams), - DiskANN(DiskANNParams), -} -pub trait VertexIndexStageParams: std::fmt::Debug + Send + Sync { - fn as_any(&self) -> &dyn Any; + DiskANN(DiskANNParams), } /// The parameters to build vector index. diff --git a/rust/src/index/vector/diskann.rs b/rust/src/index/vector/diskann.rs index bd217365777..ceea6364581 100644 --- a/rust/src/index/vector/diskann.rs +++ b/rust/src/index/vector/diskann.rs @@ -21,7 +21,7 @@ mod search; use super::{ graph::{Vertex, VertexSerDe}, - MetricType, VertexIndexStageParams, + MetricType, }; use crate::index::vector::pq::PQBuildParams; pub(crate) use builder::build_diskann_index; @@ -105,12 +105,6 @@ impl DiskANNParams { } } -impl VertexIndexStageParams for DiskANNParams { - fn as_any(&self) -> &dyn std::any::Any { - self - } -} - #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/rust/src/index/vector/ivf.rs b/rust/src/index/vector/ivf.rs index a9057c4d338..962b0b216ba 100644 --- a/rust/src/index/vector/ivf.rs +++ b/rust/src/index/vector/ivf.rs @@ -38,7 +38,7 @@ use super::{ opq::train_opq, pq::{train_pq, PQBuildParams, ProductQuantizer}, utils::maybe_sample_training_data, - MetricType, Query, VectorIndex, VertexIndexStageParams, INDEX_FILE_NAME, + MetricType, Query, VectorIndex, INDEX_FILE_NAME, }; use crate::io::object_reader::ObjectReader; use crate::{ @@ -438,12 +438,6 @@ impl Default for IvfBuildParams { } } -impl VertexIndexStageParams for IvfBuildParams { - fn as_any(&self) -> &dyn Any { - self - } -} - impl IvfBuildParams { /// Create a new instance of `IvfBuildParams`. pub fn new(num_partitions: usize) -> Self { diff --git a/rust/src/index/vector/pq.rs b/rust/src/index/vector/pq.rs index 86a36c7eced..f19bd475a45 100644 --- a/rust/src/index/vector/pq.rs +++ b/rust/src/index/vector/pq.rs @@ -28,7 +28,7 @@ use async_trait::async_trait; use futures::{stream, StreamExt, TryStreamExt}; use rand::SeedableRng; -use super::{MetricType, Query, VectorIndex, VertexIndexStageParams}; +use super::{MetricType, Query, VectorIndex}; use crate::arrow::linalg::MatrixView; use crate::arrow::*; use crate::dataset::ROW_ID; @@ -619,12 +619,6 @@ impl Default for PQBuildParams { } } -impl VertexIndexStageParams for PQBuildParams { - fn as_any(&self) -> &dyn Any { - self - } -} - /// Train product quantization over (OPQ-rotated) residual vectors. pub(crate) async fn train_pq( data: &MatrixView,