-
Notifications
You must be signed in to change notification settings - Fork 184
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- Loading branch information
1 parent
83ecc01
commit 3dd63b8
Showing
13 changed files
with
195 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
pub mod shuffler; | ||
pub mod storage; | ||
pub mod subindex; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
//! Shuffler is a component that takes a stream of record batches and shuffles them into | ||
//! the corresponding IVF partitions. | ||
|
||
use lance_core::Result; | ||
use lance_io::stream::RecordBatchStream; | ||
|
||
#[async_trait::async_trait] | ||
/// A reader that can read the shuffled partitions. | ||
pub trait IvfShuffleReader { | ||
/// Read a partition by partition_id | ||
/// will return Ok(None) if partition_size is 0 | ||
/// check reader.partiton_size(partition_id) before calling this function | ||
async fn read_partition( | ||
&self, | ||
partition_id: usize, | ||
) -> Result<Option<Box<dyn RecordBatchStream + Unpin + 'static>>>; | ||
|
||
/// Get the size of the partition by partition_id | ||
fn partiton_size(&self, partition_id: usize) -> Result<usize>; | ||
} | ||
|
||
#[async_trait::async_trait] | ||
/// A shuffler that can shuffle the incoming stream of record batches into IVF partitions. | ||
/// Returns a IvfShuffleReader that can be used to read the shuffled partitions. | ||
pub trait IvfShuffler { | ||
/// Shuffle the incoming stream of record batches into IVF partitions. | ||
/// Returns a IvfShuffleReader that can be used to read the shuffled partitions. | ||
async fn shuffle( | ||
mut self, | ||
data: Box<dyn RecordBatchStream + Unpin + 'static>, | ||
) -> Result<Box<dyn IvfShuffleReader>>; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
||
use std::sync::Arc; | ||
|
||
use arrow_array::{Float32Array, RecordBatch, StructArray, UInt64Array}; | ||
use arrow_schema::{DataType, Field, Schema, SchemaRef}; | ||
use itertools::Itertools; | ||
use lance_core::{Result, ROW_ID_FIELD}; | ||
use num_traits::Num; | ||
|
||
use crate::vector::v3::storage::DistCalculator; | ||
use crate::vector::{ | ||
graph::{OrderedFloat, OrderedNode}, | ||
DIST_COL, | ||
}; | ||
|
||
use super::storage::VectorStore; | ||
|
||
/// A prefilter that can be used to skip vectors during search | ||
/// | ||
/// Note: there is a `struct PreFilter` in `lance`. However we can't depend on `lance` in `lance-index` | ||
/// because it would create a circular dependency. | ||
/// | ||
/// By defining a trait here, we can implement the trait for `lance::PreFilter` | ||
/// and not have the circular dependency | ||
pub trait PreFilter { | ||
fn no_prefilter() -> Arc<NoPreFilter> { | ||
Arc::new(NoPreFilter {}) | ||
} | ||
|
||
fn should_drop(&self, id: u64) -> bool; | ||
} | ||
|
||
/// A prefilter that does not skip any vectors | ||
pub struct NoPreFilter {} | ||
|
||
impl PreFilter for NoPreFilter { | ||
fn should_drop(&self, _id: u64) -> bool { | ||
false | ||
} | ||
} | ||
|
||
/// A sub index for IVF index | ||
pub trait IvfSubIndex: Send + Sync + Sized { | ||
type QueryParams: Default; | ||
|
||
fn name(&self) -> &str; | ||
|
||
/// Search the sub index for nearest neighbors. | ||
/// # Arguments: | ||
/// * `query` - The query vector | ||
/// * `k` - The number of nearest neighbors to return | ||
/// * `params` - The query parameters | ||
/// * `prefilter` - The prefilter object indicating which vectors to skip | ||
fn search<T: Num>( | ||
&self, | ||
query: &[T], | ||
k: usize, | ||
params: Self::QueryParams, | ||
storage: &impl VectorStore, | ||
prefilter: Arc<impl PreFilter>, | ||
) -> Result<RecordBatch>; | ||
|
||
/// Load the sub index from a struct array with a single row | ||
fn load(data: StructArray) -> Result<Self>; | ||
|
||
/// Given a vector storage, containing all the data for the IVF partition, build the sub index. | ||
fn index_vectors(&self, storage: &impl VectorStore) -> Result<()>; | ||
|
||
/// Encode the sub index into a struct array | ||
fn to_array(&self) -> Result<StructArray>; | ||
} | ||
|
||
/// A Flat index is any index that stores no metadata, and | ||
/// during query, it simply scans over the storage and returns the top k results | ||
pub struct FlatIndex {} | ||
|
||
lazy_static::lazy_static! { | ||
static ref ANN_SEARCH_SCHEMA: SchemaRef = Arc::new(Schema::new(vec![ | ||
Field::new(DIST_COL, DataType::Float32, true), | ||
ROW_ID_FIELD.clone(), | ||
])); | ||
} | ||
|
||
impl IvfSubIndex for FlatIndex { | ||
type QueryParams = (); | ||
|
||
fn name(&self) -> &str { | ||
"FLAT" | ||
} | ||
|
||
fn search<T: Num>( | ||
&self, | ||
query: &[T], | ||
k: usize, | ||
_params: Self::QueryParams, | ||
storage: &impl VectorStore, | ||
prefilter: Arc<impl PreFilter>, | ||
) -> Result<RecordBatch> { | ||
let dist_calc = storage.dist_calculator_from_native(query); | ||
let (row_ids, dists): (Vec<u64>, Vec<f32>) = (0..storage.len()) | ||
.filter(|&id| !prefilter.should_drop(storage.row_ids()[id])) | ||
.map(|id| OrderedNode { | ||
id: id as u32, | ||
dist: OrderedFloat(dist_calc.distance(id as u32)), | ||
}) | ||
.sorted_unstable() | ||
.take(k) | ||
.map( | ||
|OrderedNode { | ||
id, | ||
dist: OrderedFloat(dist), | ||
}| (storage.row_ids()[id as usize], dist), | ||
) | ||
.unzip(); | ||
|
||
let (row_ids, dists) = (UInt64Array::from(row_ids), Float32Array::from(dists)); | ||
|
||
Ok(RecordBatch::try_new( | ||
ANN_SEARCH_SCHEMA.clone(), | ||
vec![Arc::new(dists), Arc::new(row_ids)], | ||
)?) | ||
} | ||
|
||
fn load(_: StructArray) -> Result<Self> { | ||
Ok(Self {}) | ||
} | ||
|
||
fn index_vectors(&self, _: &impl VectorStore) -> Result<()> { | ||
Ok(()) | ||
} | ||
|
||
fn to_array(&self) -> Result<StructArray> { | ||
Ok(StructArray::from(vec![])) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters