-
Notifications
You must be signed in to change notification settings - Fork 638
[Rust] Persist graph using lance file format. #756
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ad2a2e2
a7f44a0
1aa48ba
df02a59
1033c73
82f1e98
8cc7cbb
9c802ea
4c630ee
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| // Copyright 2023 Lance Developers. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| //! Graph-based vector index. | ||
| //! | ||
|
|
||
| use crate::Result; | ||
|
|
||
| // TODO: remove dead_code after implementing the index. | ||
| #[allow(dead_code)] | ||
| mod builder; | ||
| #[allow(dead_code)] | ||
| mod persisted; | ||
|
|
||
/// Vertex (metadata). It does not include the actual data.
///
/// The byte-oriented methods below define the serialized form of a vertex.
/// The byte layout itself is chosen by each implementation.
pub trait Vertex: Sized {
    /// Returns a length in bytes for this vertex.
    // NOTE(review): presumably the length of the serialized form, i.e. equal to
    // `self.to_bytes().len()` — confirm against the persisted-graph writer.
    fn byte_length(&self) -> usize;

    /// Deserializes a vertex from `data`.
    ///
    /// Returns a `Result`, so an implementation can report bytes it cannot decode.
    fn from_bytes(data: &[u8]) -> Result<Self>;

    // TODO: impl as Into trait?
    /// Serializes this vertex into a newly allocated byte vector.
    fn to_bytes(&self) -> Vec<u8>;
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,134 @@ | ||
| // Copyright 2023 Lance Developers. | ||
| // | ||
| // Licensed under the Apache License, Version 2.0 (the "License"); | ||
| // you may not use this file except in compliance with the License. | ||
| // You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, software | ||
| // distributed under the License is distributed on an "AS IS" BASIS, | ||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| //! Graph in memory. | ||
|
|
||
| use super::Vertex; | ||
|
|
||
| /// A graph node to hold the vertex data and its neighbors. | ||
| #[derive(Debug)] | ||
| pub(crate) struct Node<V: Vertex> { | ||
| /// The vertex metadata. It will be serialized into a fixed-size binary in the persisted graph. | ||
| pub(crate) vertex: V, | ||
|
|
||
| /// Neighbors are the ids of the adjacent vertices in the graph. | ||
| /// This id is not the same as the row_id in the original lance dataset. | ||
| pub(crate) neighbors: Vec<u32>, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are the neighbors other `Node`s, other `Vertex`es, or row ids? If they are row ids, should the type be `u64`?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This neighbor id / vertex id is the vertex's location in the graph index file, which is ordered differently from the "row id" that points to the original vector in the dataset.
|
||
| } | ||
|
|
||
/// An in-memory graph that can be built up dynamically and persisted later.
///
/// It requires all vertices to be of the same size.
pub struct GraphBuilder<V: Vertex> {
    /// Nodes of the graph. A vertex id is an index into this vector.
    pub(crate) nodes: Vec<Node<V>>,
}
|
|
||
| impl<V: Vertex> GraphBuilder<V> { | ||
| pub fn new() -> Self { | ||
| Self { nodes: vec![] } | ||
| } | ||
|
|
||
| pub fn len(&self) -> usize { | ||
| self.nodes.len() | ||
| } | ||
|
|
||
| pub fn is_empty(&self) -> bool { | ||
| self.nodes.is_empty() | ||
| } | ||
|
|
||
| pub fn vertex(&self, id: usize) -> &V { | ||
| &self.nodes[id].vertex | ||
| } | ||
|
|
||
| pub fn vertex_mut(&mut self, id: usize) -> &mut V { | ||
| &mut self.nodes[id].vertex | ||
| } | ||
|
|
||
| pub fn neighbors(&self, id: usize) -> &[u32] { | ||
| self.nodes[id].neighbors.as_slice() | ||
| } | ||
|
|
||
| pub fn neighbors_mut(&mut self, id: usize) -> &mut Vec<u32> { | ||
| &mut self.nodes[id].neighbors | ||
| } | ||
|
|
||
| pub fn add_edge(&mut self, from: usize, to: usize) { | ||
| self.nodes[from].neighbors.push(to as u32); | ||
| } | ||
| } | ||
|
|
||
| impl<V: Vertex> FromIterator<V> for GraphBuilder<V> { | ||
| fn from_iter<I: IntoIterator<Item = V>>(iter: I) -> Self { | ||
| let nodes: Vec<Node<V>> = iter | ||
| .into_iter() | ||
| .map(|v| Node { | ||
| vertex: v, | ||
| neighbors: vec![], | ||
| }) | ||
| .collect(); | ||
|
|
||
| GraphBuilder { nodes: nodes } | ||
| } | ||
| } | ||
|
|
||
#[cfg(test)]
mod tests {
    use approx::assert_relative_eq;

    use super::*;
    use crate::Result;

    /// Minimal vertex used to exercise the builder: a `u32` id and an `f32` value,
    /// serialized as 8 little-endian bytes.
    struct FooVertex {
        id: u32,
        val: f32,
    }

    impl Vertex for FooVertex {
        fn byte_length(&self) -> usize {
            8
        }

        fn from_bytes(data: &[u8]) -> Result<Self> {
            // The first four bytes hold the id, the next four hold the value.
            let id = u32::from_le_bytes(data[0..4].try_into().unwrap());
            let val = f32::from_le_bytes(data[4..8].try_into().unwrap());
            Ok(Self { id, val })
        }

        fn to_bytes(&self) -> Vec<u8> {
            [self.id.to_le_bytes(), self.val.to_le_bytes()].concat()
        }
    }

    #[test]
    fn test_construct_builder() {
        let vertices = (0..100).map(|i| FooVertex {
            id: i as u32,
            val: i as f32 * 0.5,
        });
        let mut graph = vertices.collect::<GraphBuilder<FooVertex>>();

        // Freshly collected graph: vertices keep their order and have no edges.
        assert_eq!(graph.len(), 100);
        assert_eq!(graph.vertex(77).id, 77);
        assert_relative_eq!(graph.vertex(77).val, 38.5);
        assert!(graph.neighbors(55).is_empty());

        // Mutation through `vertex_mut` is visible through `vertex`.
        graph.vertex_mut(88).val = 22.0;
        assert_relative_eq!(graph.vertex(88).val, 22.0);
    }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do the bytes represent? Or are they just arbitrary bytes at the trait level?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These bytes are just the serialized vertex, so that different graph implementations can share the same on-disk graph design.