Skip to content

Commit

Permalink
feat: define Flat index as a scan over VectorStorage
Browse files Browse the repository at this point in the history
ghstack-source-id: dcbf9822d8e276436ed51685967fad3773be0932
Pull Request resolved: #2380
  • Loading branch information
chebbyChefNEQ committed May 23, 2024
1 parent 88c771a commit 9c65194
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 12 deletions.
2 changes: 1 addition & 1 deletion rust/lance-index/src/vector/v3/shuffler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use lance_io::stream::RecordBatchStream;

#[async_trait::async_trait]
/// A reader that can read the shuffled partitions.
pub trait IvfShuffleReader: Send + Sync {
pub trait IvfShuffleReader {
/// Read a partition by partition_id
/// will return Ok(None) if partition_size is 0
/// check reader.partiton_size(partition_id) before calling this function
Expand Down
118 changes: 107 additions & 11 deletions rust/lance-index/src/vector/v3/subindex.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,48 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use arrow_array::{RecordBatch, StructArray};
use arrow_schema::SchemaRef;
use lance_core::Result;
use std::sync::Arc;

use arrow_array::{Float32Array, RecordBatch, StructArray, UInt64Array};
use arrow_schema::{DataType, Field, Schema, SchemaRef};
use itertools::Itertools;
use lance_core::{Result, ROW_ID_FIELD};
use num_traits::Num;
use roaring::RoaringBitmap;

use crate::vector::v3::storage::DistCalculator;
use crate::vector::{
graph::{OrderedFloat, OrderedNode},
DIST_COL,
};

use super::storage::VectorStore;

/// A prefilter that can be used to skip vectors during search
///
/// Note: there is a `struct PreFilter` in `lance`. However we can't depend on `lance` in `lance-index`
/// because it would create a circular dependency.
///
/// By defining a trait here, we can implement the trait for `lance::PreFilter`
/// and not have the circular dependency
pub trait PreFilter {
fn no_prefilter() -> Arc<NoPreFilter> {
Arc::new(NoPreFilter {})
}

fn should_drop(&self, id: u64) -> bool;
}

/// A prefilter that does not skip any vectors
pub struct NoPreFilter {}

impl PreFilter for NoPreFilter {
fn should_drop(&self, _id: u64) -> bool {
false
}
}

/// A sub index for IVF index
pub trait IvfSubIndex<T: Num, Store: VectorStore>: Send + Sync + Sized {
pub trait IvfSubIndex: Send + Sync + Sized {
type QueryParams: Default;

fn index_name(&self) -> &str;
Expand All @@ -20,14 +52,14 @@ pub trait IvfSubIndex<T: Num, Store: VectorStore>: Send + Sync + Sized {
/// * `query` - The query vector
/// * `k` - The number of nearest neighbors to return
/// * `params` - The query parameters
/// * `pre_filter_bitmap` - The pre filter bitmap indicating which vectors to skip
fn search(
/// * `prefilter` - The prefilter object indicating which vectors to skip
fn search<T: Num>(
&self,
query: &[T],
k: usize,
params: Self::QueryParams,
storage: &Store,
pre_filter_bitmap: Option<RoaringBitmap>,
storage: &impl VectorStore,
prefilter: Arc<impl PreFilter>,
) -> Result<RecordBatch>;

// check if the builder supports the metadata schema requested
Expand All @@ -39,7 +71,7 @@ pub trait IvfSubIndex<T: Num, Store: VectorStore>: Send + Sync + Sized {
fn load(data: StructArray) -> Result<Self>;

/// Given a vector storage, containing all the data for the IVF partition, build the sub index.
fn index(&self, storage: &Store) -> Result<()>;
fn index(&self, storage: &impl VectorStore) -> Result<()>;

/// Turn the sub index into a struct array
fn to_array(&self) -> Result<StructArray>;
Expand All @@ -55,8 +87,72 @@ pub trait IvfSubIndex<T: Num, Store: VectorStore>: Send + Sync + Sized {
///
/// The roundtrip looks like
/// IvfSubIndexBuilder.index(data).to_array()
fn build(&self, storage: &Store) -> Result<StructArray> {
fn build(&self, storage: &impl VectorStore) -> Result<StructArray> {
self.index(storage)?;
self.to_array()
}
}

/// A Flat index is any index that stores no metadata, and
/// during query, it simply scans over the storage and returns the top k results
pub struct FlatIndex {}

lazy_static::lazy_static! {
static ref ANN_SEARCH_SCHEMA: SchemaRef = Arc::new(Schema::new(vec![
Field::new(DIST_COL, DataType::Float32, true),
ROW_ID_FIELD.clone(),
]));
}

impl IvfSubIndex for FlatIndex {
type QueryParams = ();

fn index_name(&self) -> &str {
"FLAT"
}

fn search<T: Num>(
&self,
query: &[T],
k: usize,
_params: Self::QueryParams,
storage: &impl VectorStore,
prefilter: Arc<impl PreFilter>,
) -> Result<RecordBatch> {
let dist_calc = storage.dist_calculator_from_native(query);
let (row_ids, dists): (Vec<u64>, Vec<f32>) = (0..storage.len())
.filter(|&id| !prefilter.should_drop(storage.row_ids()[id]))
.map(|id| OrderedNode {
id: id as u32,
dist: OrderedFloat(dist_calc.distance(id as u32)),
})
.sorted_unstable()
.take(k)
.map(
|OrderedNode {
id,
dist: OrderedFloat(dist),
}| (storage.row_ids()[id as usize], dist),
)
.unzip();

let (row_ids, dists) = (UInt64Array::from(row_ids), Float32Array::from(dists));

Ok(RecordBatch::try_new(
ANN_SEARCH_SCHEMA.clone(),
vec![Arc::new(dists), Arc::new(row_ids)],
)?)
}

fn load(_: StructArray) -> Result<Self> {
Ok(Self {})
}

fn index(&self, _: &impl VectorStore) -> Result<()> {
Ok(())
}

fn to_array(&self) -> Result<StructArray> {
Ok(StructArray::from(vec![]))
}
}

0 comments on commit 9c65194

Please sign in to comment.