
feat: expose index cache size #1587

Merged 11 commits on Nov 17, 2023
8 changes: 7 additions & 1 deletion python/python/lance/__init__.py
@@ -89,7 +89,13 @@ def dataset(
Approximately, ``n = Total Rows / number of IVF partitions``.
``pq = number of PQ sub-vectors``.
"""
-    ds = LanceDataset(uri, version, block_size, commit_lock=commit_lock)
+    ds = LanceDataset(
+        uri,
+        version,
+        block_size,
+        commit_lock=commit_lock,
+        index_cache_size=index_cache_size,
+    )

Contributor: Does the index cache have a TTL (I'm looking at the comment above which, to be fair, looks like it isn't part of this PR)? I don't think it does.
if version is None and asof is not None:
ts_cutoff = sanitize_ts(asof)
ver_cutoff = max(
6 changes: 5 additions & 1 deletion python/python/lance/dataset.py
@@ -839,6 +839,7 @@ def create_index(
ivf_centroids: Optional[Union[np.ndarray, pa.FixedSizeListArray]] = None,
num_sub_vectors: Optional[int] = None,
accelerator: Optional[Union[str, "torch.Device"]] = None,
index_cache_size: Optional[int] = None,
Contributor: I wonder if it would be better to change create_index so that it modifies the dataset in place instead of returning a new dataset. Though that would be a breaking change, so perhaps that ship has sailed.

Contributor: I was intending to do that at some point. I had done this on the Rust side earlier, since it was a source of bugs: #1118

Contributor Author: That does seem like the better design to me.
**kwargs,
) -> LanceDataset:
"""Create index on column.
@@ -870,6 +871,8 @@ def create_index(
If set, use an accelerator to speed up the training process.
Accepted accelerator: "cuda" (Nvidia GPU) and "mps" (Apple Silicon GPU).
If not set, use the CPU.
index_cache_size : int, optional
The size of the index cache in number of entries. Default value is 256.
kwargs :
Parameters passed to the index building process.

@@ -1015,7 +1018,7 @@ def create_index(
kwargs["ivf_centroids"] = ivf_centroids_batch

self._ds.create_index(column, index_type, name, replace, kwargs)
-        return LanceDataset(self.uri)
+        return LanceDataset(self.uri, index_cache_size=index_cache_size)

@staticmethod
def _commit(
@@ -1807,6 +1810,7 @@ def index_stats(self, index_name: str) -> Dict[str, Any]:
index_stats = json.loads(self._ds.index_statistics(index_name))
index_stats["num_indexed_rows"] = self._ds.count_indexed_rows(index_name)
index_stats["num_unindexed_rows"] = self._ds.count_unindexed_rows(index_name)
index_stats["index_cache_entry_count"] = self._ds.index_cache_entry_count()
return index_stats


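The docstring above defines `index_cache_size` as a limit on the number of cache entries (default 256), not on bytes. As a rough illustration of those semantics — this is a stdlib sketch for exposition, not lance's actual cache, which lives on the Rust side — an entry-count-bounded LRU looks like:

```python
from collections import OrderedDict


class EntryCountCache:
    """Toy LRU cache bounded by entry count, mirroring the semantics of
    the index_cache_size parameter (a limit on entries, not bytes)."""

    def __init__(self, max_entries: int):
        self.max_entries = max_entries
        self._entries = OrderedDict()

    def insert(self, key, value):
        self._entries[key] = value
        self._entries.move_to_end(key)  # mark as most recently used
        while len(self._entries) > self.max_entries:
            self._entries.popitem(last=False)  # evict least recently used

    def get(self, key):
        if key not in self._entries:
            return None
        self._entries.move_to_end(key)  # touching an entry refreshes it
        return self._entries[key]

    def entry_count(self) -> int:
        return len(self._entries)
```

With `max_entries=10`, inserting 128 partition entries leaves exactly 10 in the cache — the same saturation behavior `test_index_cache_size` below asserts against the real cache.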
44 changes: 44 additions & 0 deletions python/python/tests/test_vector_index.py
@@ -297,6 +297,7 @@ def test_pre_populated_ivf_centroids(dataset, tmp_path: Path):
if platform.system() == "Windows":
expected_filepath = expected_filepath.replace("\\", "/")
expected_statistics = {
"index_cache_entry_count": 1,
"index_type": "IVF",
"uuid": index_uuid,
"uri": expected_filepath,
@@ -465,3 +466,46 @@ def test_knn_with_deletions(tmp_path):
    assert len(results) == 10

    assert expected == [r.as_py() for r in results]


def test_index_cache_size(tmp_path):
    rng = np.random.default_rng(seed=42)

    def query_index(ds, ntimes):
        ndim = ds.schema[0].type.list_size
        for _ in range(ntimes):
            ds.to_table(
                nearest={
                    "column": "vector",
                    "q": rng.standard_normal(ndim),
                },
            )

    tbl = create_table(nvec=1024, ndim=16)
    dataset = lance.write_dataset(tbl, tmp_path / "test")

    indexed_dataset = dataset.create_index(
        "vector",
        index_type="IVF_PQ",
        num_partitions=128,
        num_sub_vectors=2,
        index_cache_size=10,
    )

    assert (
        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 1
    )
    query_index(indexed_dataset, 1)
    assert (
        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 2
    )
    query_index(indexed_dataset, 128)
    assert (
        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 10
    )

    indexed_dataset = lance.LanceDataset(indexed_dataset.uri, index_cache_size=5)
    query_index(indexed_dataset, 128)
    assert (
        indexed_dataset.stats.index_stats("vector_idx")["index_cache_entry_count"] == 5
    )
4 changes: 4 additions & 0 deletions python/src/dataset.rs
@@ -771,6 +771,10 @@ impl Dataset {
        }
    }

    fn index_cache_entry_count(&self) -> PyResult<usize> {
        Ok(self.ds.index_cache_entry_count())
    }

    #[staticmethod]
    fn commit(
        dataset_uri: &str,
5 changes: 5 additions & 0 deletions rust/lance/src/dataset.rs
@@ -1210,6 +1210,11 @@ impl Dataset {
Version::from(self.manifest.as_ref())
}

    /// Get the number of entries currently in the index cache.
    pub fn index_cache_entry_count(&self) -> usize {
        self.session.index_cache.get_size()
    }

/// Get all versions.
pub async fn versions(&self) -> Result<Vec<Version>> {
let mut versions: Vec<Version> = self
3 changes: 3 additions & 0 deletions rust/lance/src/dataset/scanner.rs
@@ -2566,6 +2566,7 @@ mod test {
let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
let mut dataset = Dataset::write(reader, test_uri, None).await.unwrap();

assert_eq!(dataset.index_cache_entry_count(), 0);
dataset
.create_index(
&["vec"],
@@ -2584,6 +2585,7 @@
scan.refine(100);
scan.nprobs(100);

assert_eq!(dataset.index_cache_entry_count(), 0);
let results = scan
.try_into_stream()
.await
@@ -2592,6 +2594,7 @@
.await
.unwrap();

assert_eq!(dataset.index_cache_entry_count(), 5);
assert_eq!(results.len(), 1);
let batch = &results[0];

1 change: 1 addition & 0 deletions rust/lance/src/index/append.rs
@@ -246,5 +246,6 @@ mod tests {
.iter()
.sum::<usize>();
assert_eq!(row_in_index, 2000);
assert_eq!(dataset.index_cache_entry_count(), 6)
}
}
6 changes: 6 additions & 0 deletions rust/lance/src/index/cache.rs
@@ -39,6 +39,12 @@ impl IndexCache {
self.vector_cache.entry_count() as usize
}

    pub(crate) fn get_size(&self) -> usize {
        self.scalar_cache.sync();
        self.vector_cache.sync();
        self.scalar_cache.entry_count() as usize + self.vector_cache.entry_count() as usize
    }

Contributor Author: @wjones127 do you feel just summing this is ok?

Contributor: TBH, I'm not sure I care about the entry count. As a user, I would much rather set the limit in terms of bytes and get the total bytes consumed in the statistics. So this is fine for now, but long term I think we ought to consider switching to evicting based on in-memory size.

Contributor Author: IIUC we can use weighted_size and a weigher to do that. (I hope moka is better at cache invalidation than naming.) I can give that a quick try.

Contributor Author: Actually, let's make that a separate issue so it doesn't stall this one.

Contributor:
> I hope moka is better at cache invalidation than naming

🤣

> Actually let's make that a separate issue to not stall this one.

Sounds good!

Contributor Author: Opened #1613

Contributor: I think summing is fine. We might just merge these into one field someday.

However, I think there is one potential problem: the user might set the index cache size to X and then, if they have both scalar and vector indices, see an entry count of 2 * X. Still, the best long-term solution is probably bytes, so let's stick with summing for now.
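The byte-based eviction discussed in the thread above (deferred to #1613) can be illustrated in the same toy style: a weigher maps each value to a weight, and eviction is driven by total weight rather than entry count. This is a hedged stdlib sketch of the idea only — it is neither moka's API nor lance's implementation:

```python
from collections import OrderedDict


class WeightedCache:
    """Toy LRU cache that evicts by total weight (e.g. bytes) rather than
    by entry count -- the direction the review thread suggests for the
    index cache long term."""

    def __init__(self, max_weight: int, weigher):
        self.max_weight = max_weight
        self.weigher = weigher  # maps a value to its weight
        self._entries = OrderedDict()
        self._total_weight = 0

    def insert(self, key, value):
        if key in self._entries:
            # replacing an entry releases its old weight first
            self._total_weight -= self.weigher(self._entries.pop(key))
        self._entries[key] = value
        self._total_weight += self.weigher(value)
        # evict least recently used entries until under the weight budget,
        # always keeping at least the entry just inserted
        while self._total_weight > self.max_weight and len(self._entries) > 1:
            _, evicted = self._entries.popitem(last=False)
            self._total_weight -= self.weigher(evicted)

    def weighted_size(self) -> int:
        return self._total_weight
```

With `weigher=len` and a 100-byte budget, ten 30-byte inserts settle at three resident entries (90 bytes) — the "total bytes consumed" statistic the reviewer asks for falls out of `weighted_size()` for free.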

/// Get an Index if present. Otherwise returns [None].
pub(crate) fn get_scalar(&self, key: &str) -> Option<Arc<dyn ScalarIndex>> {
self.scalar_cache.get(key)
2 changes: 2 additions & 0 deletions rust/lance/src/session.rs
@@ -100,13 +100,15 @@ mod tests {
MetricType::L2,
));
let idx = Arc::new(PQIndex::new(pq, MetricType::L2));
assert_eq!(session.index_cache.get_size(), 0);
session.index_cache.insert_vector("abc", idx.clone());

let found = session.index_cache.get_vector("abc");
assert!(found.is_some());
assert_eq!(format!("{:?}", found.unwrap()), format!("{:?}", idx));
assert!(session.index_cache.get_vector("abc").is_some());
assert_eq!(session.index_cache.len_vector(), 1);
assert_eq!(session.index_cache.get_size(), 1);

for iter_idx in 0..100 {
let pq_other = Arc::new(ProductQuantizerImpl::<Float32Type>::new(