Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions protos/index.proto
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ message VectorIndexStage {
}
}


// Metric Type for Vector Index
enum VectorMetricType {
// L2 (Euclidean) Distance
Expand Down
1 change: 1 addition & 0 deletions rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ datafusion = { version = "19.0.0", default-features = false }
faiss = { version = "0.11.0", features = ["gpu"], optional = true }
lapack = "0.19.0"
cblas = "0.4.0"
lru_time_cache = "0.11"

[target.'cfg(target_os = "macos")'.dependencies]
accelerate-src = "0.3.2"
Expand Down
4 changes: 4 additions & 0 deletions rust/src/arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,10 @@ impl FixedSizeBinaryArrayExt for FixedSizeBinaryArray {
}
}

/// Downcasts a dynamically typed Arrow array to a [`FixedSizeBinaryArray`] reference.
///
/// The returned reference borrows from `arr`; no data is copied.
///
/// # Panics
///
/// Panics if the concrete type behind `arr` is not [`FixedSizeBinaryArray`].
pub fn as_fixed_size_binary_array(arr: &dyn Array) -> &FixedSizeBinaryArray {
    arr.as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        // A mismatch here is a caller bug; say so instead of a bare `unwrap` message.
        .expect("as_fixed_size_binary_array: array is not a FixedSizeBinaryArray")
}

/// Extends Arrow's [RecordBatch].
pub trait RecordBatchExt {
/// Append a new column to this [`RecordBatch`] and returns a new RecordBatch.
Expand Down
1 change: 1 addition & 0 deletions rust/src/index/vector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use std::sync::Arc;
use arrow_array::Float32Array;

pub mod flat;
mod graph;
pub mod ivf;
mod kmeans;
mod opq;
Expand Down
34 changes: 34 additions & 0 deletions rust/src/index/vector/graph.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Graph-based vector index.
//!

use crate::Result;

// TODO: remove dead_code after implementing the index.
// In-memory graph representation (builder.rs describes itself as "Graph in memory").
#[allow(dead_code)]
mod builder;
// NOTE(review): presumably the persisted (on-disk) graph; its contents are not visible here — confirm.
#[allow(dead_code)]
mod persisted;

/// Vertex (metadata). It does not include the actual data.
///
/// The byte methods below expose the serialized form of a vertex, so that
/// different graph implementations can share the same on-disk graph design.
pub trait Vertex: Sized {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what do the bytes represent? Or are they just arbitrary bytes at the trait level?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just serialized vertex, so each different graph implementation can share the same on-disk graph design.

/// Size, in bytes, of this vertex.
///
/// NOTE(review): presumably equal to `self.to_bytes().len()`, i.e. the size
/// of the serialized vertex — confirm against implementors.
fn byte_length(&self) -> usize;

/// Reconstructs a vertex from its serialized bytes.
///
/// Returns an error if the implementation cannot decode `data`.
fn from_bytes(data: &[u8]) -> Result<Self>;

// TODO: impl as Into trait?
/// Serializes this vertex into a newly allocated byte vector.
fn to_bytes(&self) -> Vec<u8>;
}
134 changes: 134 additions & 0 deletions rust/src/index/vector/graph/builder.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// Copyright 2023 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Graph in memory.

use super::Vertex;

/// A graph node to hold the vertex data and its neighbors.
#[derive(Debug)]
pub(crate) struct Node<V: Vertex> {
/// The vertex metadata. It will be serialized into fixed size binary in the persisted graph.
pub(crate) vertex: V,

/// Neighbors are the ids of vertex in the graph.
/// This id is not the same as the row_id in the original lance dataset.
///
/// Per the design discussion, an id is the vertex's location in the graph
/// index file. `u32` is used instead of `u64` to halve the I/O and memory
/// footprint, which limits a single index to about 4 billion vectors.
pub(crate) neighbors: Vec<u32>,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are the neighbors other Node's? other Vertex's? or the row id? if row id - should it be u64?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This neighour id / vertex id will be the location in the graph index file. which will be different order from "row id" that points to the original vector in the dataset.

u32 can support up to 4B vectors per index. Feel that beyond that point, we need to apply IVF first? Main consideration is that it reduces the I/O and memory footprints by half.

}

/// An in-memory graph that can be built up dynamically and persisted later.
///
/// It requires all vertices to be of the same size.
pub struct GraphBuilder<V: Vertex> {
// One entry per vertex; a vertex id is its index into this vector.
pub(crate) nodes: Vec<Node<V>>,
}

impl<V: Vertex> GraphBuilder<V> {
pub fn new() -> Self {
Self { nodes: vec![] }
}

pub fn len(&self) -> usize {
self.nodes.len()
}

pub fn is_empty(&self) -> bool {
self.nodes.is_empty()
}

pub fn vertex(&self, id: usize) -> &V {
&self.nodes[id].vertex
}

pub fn vertex_mut(&mut self, id: usize) -> &mut V {
&mut self.nodes[id].vertex
}

pub fn neighbors(&self, id: usize) -> &[u32] {
self.nodes[id].neighbors.as_slice()
}

pub fn neighbors_mut(&mut self, id: usize) -> &mut Vec<u32> {
&mut self.nodes[id].neighbors
}

pub fn add_edge(&mut self, from: usize, to: usize) {
self.nodes[from].neighbors.push(to as u32);
}
}

/// Collects vertices into a [`GraphBuilder`]; every vertex becomes a node
/// with an empty neighbor list, in iteration order.
impl<V: Vertex> FromIterator<V> for GraphBuilder<V> {
    fn from_iter<I: IntoIterator<Item = V>>(iter: I) -> Self {
        // Wrap a bare vertex into a node that has no edges yet.
        let isolated = |vertex: V| Node {
            vertex,
            neighbors: Vec::new(),
        };

        Self {
            nodes: iter.into_iter().map(isolated).collect(),
        }
    }
}

#[cfg(test)]
mod tests {
    use approx::assert_relative_eq;

    use super::*;
    use crate::Result;

    /// Minimal vertex for exercising the builder: a `u32` id followed by an
    /// `f32` value, both little-endian, 8 bytes in total.
    struct FooVertex {
        id: u32,
        val: f32,
    }

    impl Vertex for FooVertex {
        fn byte_length(&self) -> usize {
            8
        }

        fn from_bytes(data: &[u8]) -> Result<Self> {
            let id = u32::from_le_bytes(data[0..4].try_into().unwrap());
            let val = f32::from_le_bytes(data[4..8].try_into().unwrap());
            Ok(Self { id, val })
        }

        fn to_bytes(&self) -> Vec<u8> {
            // Two 4-byte little-endian fields laid out back to back.
            [self.id.to_le_bytes(), self.val.to_le_bytes()].concat()
        }
    }

    #[test]
    fn test_construct_builder() {
        let vertices = (0..100).map(|i| FooVertex {
            id: i as u32,
            val: i as f32 * 0.5,
        });
        let mut builder = vertices.collect::<GraphBuilder<FooVertex>>();

        // Freshly collected graph: vertices in order, no edges.
        assert_eq!(builder.len(), 100);
        let probe = builder.vertex(77);
        assert_eq!(probe.id, 77);
        assert_relative_eq!(probe.val, 38.5);
        assert!(builder.neighbors(55).is_empty());

        // Mutation through `vertex_mut` is visible through `vertex`.
        let target = builder.vertex_mut(88);
        target.val = 22.0;
        assert_relative_eq!(builder.vertex(88).val, 22.0);
    }
}
Loading