Commit

Move under .stats
rok committed Nov 15, 2023
1 parent 0ddf083 commit b3ad2cc
Showing 4 changed files with 24 additions and 54 deletions.
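In short, this commit moves the index cache size out of the dataset API surface: the LanceDataset.index_cache_size property and the write_dataset(..., index_cache_size=...) parameter are removed, and the value is instead reported per index through dataset.stats.index_stats(). A minimal before/after sketch of reading the value (the uri variable and the index name "vector_idx" are illustrative placeholders, not part of the diff):

import lance

ds = lance.LanceDataset(uri, index_cache_size=10)

# Before this commit: a dataset-level property.
# size = ds.index_cache_size

# After this commit: reported under .stats, per index.
size = ds.stats.index_stats("vector_idx")["index_cache_size"]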
14 changes: 2 additions & 12 deletions python/python/lance/dataset.py
@@ -634,13 +634,6 @@ def version(self) -> int:
         """
         return self._ds.version()
 
-    @property
-    def index_cache_size(self) -> int:
-        """
-        Returns the index cache size of the dataset
-        """
-        return self._ds.index_cache_size()
-
     @property
     def latest_version(self) -> int:
         """
@@ -883,7 +876,7 @@ def create_index(
             kwargs["replace"] = replace
 
         self._ds.create_index(column, index_type, name, metric, kwargs)
-        return LanceDataset(self.uri)
+        return LanceDataset(self.uri, index_cache_size=index_cache_size)
 
     @staticmethod
     def _commit(
@@ -1675,6 +1668,7 @@ def index_stats(self, index_name: str) -> Dict[str, Any]:
         index_stats = json.loads(self._ds.index_statistics(index_name))
         index_stats["num_indexed_rows"] = self._ds.count_indexed_rows(index_name)
         index_stats["num_unindexed_rows"] = self._ds.count_unindexed_rows(index_name)
+        index_stats["index_cache_size"] = self._ds.index_cache_size()
         return index_stats


@@ -1684,7 +1678,6 @@ def write_dataset(
     schema: Optional[pa.Schema] = None,
     mode: str = "create",
     *,
-    index_cache_size: int = 256,
     max_rows_per_file: int = 1024 * 1024,
     max_rows_per_group: int = 1024,
     max_bytes_per_file: int = 90 * 1024 * 1024 * 1024,
@@ -1708,8 +1701,6 @@ def write_dataset(
         **overwrite** - create a new snapshot version
         **append** - create a new version that is the concat of the input the
         latest version (raises if uri does not exist)
-    index_cache_size: int, default 256
-        The number of index entries to cache in memory.
     max_rows_per_file: int, default 1024 * 1024
         The max number of rows to write before starting a new file
     max_rows_per_group: int, default 1024
@@ -1734,7 +1725,6 @@ def write_dataset(
 
     params = {
         "mode": mode,
-        "index_cache_size": index_cache_size,
         "max_rows_per_file": max_rows_per_file,
         "max_rows_per_group": max_rows_per_group,
         "max_bytes_per_file": max_bytes_per_file,
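With index_cache_size gone from write_dataset, the cache is configured where it is actually used: when opening a dataset, or through create_index, which now returns a dataset reopened with the requested size. A sketch of the updated flow, mirroring the Python changes above (tbl and uri are placeholders):

import lance

# write_dataset no longer accepts index_cache_size.
dataset = lance.write_dataset(tbl, uri)

# Configure the cache when building the index; per the diff above,
# create_index reopens the dataset with this cache size.
indexed = dataset.create_index(
    "vector",
    index_type="IVF_PQ",
    num_partitions=128,
    num_sub_vectors=2,
    index_cache_size=10,
)

# Or set it explicitly when (re)opening the dataset.
indexed = lance.LanceDataset(uri, index_cache_size=50)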
55 changes: 22 additions & 33 deletions python/python/tests/test_vector_index.py
@@ -468,41 +468,30 @@ def test_knn_with_deletions(tmp_path):
 
 
 def test_index_cache_size(tmp_path):
-    ndim = 16
-    tbl = create_table(nvec=1024, ndim=ndim)
-    dataset = lance.write_dataset(tbl, tmp_path, index_cache_size=10)
+    def query_index(ds, ntimes):
+        ndim = ds.schema[0].type.list_size
+        for _ in range(ntimes):
+            ds.to_table(
+                nearest={
+                    "column": "vector",
+                    "q": np.random.randn(ndim),
+                },
+            )
 
+    tbl = create_table(nvec=1024, ndim=16)
+    dataset = lance.write_dataset(tbl, tmp_path / "test")
+
     indexed_dataset = dataset.create_index(
-        "vector", index_type="IVF_PQ", num_partitions=256,
+        "vector", index_type="IVF_PQ", num_partitions=128,
         num_sub_vectors=2, index_cache_size=10,
     )
-    indexed_dataset = lance.LanceDataset(indexed_dataset.uri, index_cache_size=10)
-    # indexed_dataset = lance.write_dataset(tbl, tmp_path, index_cache_size=10)
 
-    assert indexed_dataset.index_cache_size == 0
-
-    q = np.random.randn(ndim)
-
-    indexed_dataset.to_table(
-        nearest={
-            "column": "vector",
-            "q": q,
-        },
-    )
-    assert indexed_dataset.index_cache_size == 2
+    assert indexed_dataset.stats.index_stats("vector_idx")["index_cache_size"] == 1
+    query_index(indexed_dataset, 1)
+    assert indexed_dataset.stats.index_stats("vector_idx")["index_cache_size"] == 2
+    query_index(indexed_dataset, 128)
+    assert indexed_dataset.stats.index_stats("vector_idx")["index_cache_size"] == 10
 
-    indexed_dataset.to_table(
-        nearest={
-            "column": "vector",
-            "q": q,
-        },
-    )
-    assert indexed_dataset.index_cache_size == 2
-
-    for _ in range(128):
-        indexed_dataset.to_table(
-            nearest={
-                "column": "vector",
-                "q": np.random.randn(ndim),
-            },
-        )
-    assert indexed_dataset.index_cache_size == 10
+    indexed_dataset = lance.LanceDataset(indexed_dataset.uri, index_cache_size=50)
+    query_index(indexed_dataset, 256)
+    assert indexed_dataset.stats.index_stats("vector_idx")["index_cache_size"] == 50
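The rewritten test also documents the stat's apparent semantics: the cache holds one entry right after the index is built, grows as queries probe new partitions, and is capped at the configured index_cache_size; reopening with a larger budget raises the cap. A hedged helper for checking whether a given budget saturates under random query load (the helper itself is hypothetical; the column name "vector" and index name "vector_idx" follow the test above):

import lance
import numpy as np

def cache_saturated(uri, index_name="vector_idx", budget=10, nqueries=128):
    # Open with the cache budget under test.
    ds = lance.LanceDataset(uri, index_cache_size=budget)
    ndim = ds.schema.field("vector").type.list_size
    # Random queries so the scans probe many IVF partitions.
    for _ in range(nqueries):
        ds.to_table(nearest={"column": "vector", "q": np.random.randn(ndim)})
    used = ds.stats.index_stats(index_name)["index_cache_size"]
    # True if the cache filled all the way up to its budget.
    return used >= budget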
4 changes: 0 additions & 4 deletions python/src/dataset.rs
@@ -598,7 +598,6 @@ impl Dataset {
         index_type: &str,
         name: Option<String>,
         metric_type: Option<&str>,
-        index_cache_size: Option<usize>,
         kwargs: Option<&PyDict>,
     ) -> PyResult<()> {
         let idx_type = match index_type.to_uppercase().as_str() {
@@ -858,9 +857,6 @@ pub fn get_write_params(options: &PyDict) -> PyResult<Option<WriteParams>> {
         if let Some(mode) = options.get_item("mode") {
             p.mode = parse_write_mode(mode.extract::<String>()?.as_str())?;
         };
-        if let Some(index_cache_size) = options.get_item("index_cache_size") {
-            p.index_cache_size = usize::extract(index_cache_size)?;
-        }
         if let Some(maybe_nrows) = options.get_item("max_rows_per_file") {
             p.max_rows_per_file = usize::extract(maybe_nrows)?;
         }
5 changes: 0 additions & 5 deletions rust/lance/src/dataset/write.rs
@@ -52,9 +52,6 @@ pub enum WriteMode {
 /// Dataset Write Parameters
 #[derive(Debug, Clone)]
 pub struct WriteParams {
-    /// Index cache size
-    pub index_cache_size: usize,
-
     /// Max number of records per file.
     pub max_rows_per_file: usize,
 
@@ -85,7 +82,6 @@ pub struct WriteParams {
 impl Default for WriteParams {
     fn default() -> Self {
         Self {
-            index_cache_size: 256,
             max_rows_per_file: 1024 * 1024, // 1 million
             max_rows_per_group: 1024,
             // object-store has a 100GB limit, so we should at least make sure
@@ -352,7 +348,6 @@ mod tests {
             .unwrap();
 
         let write_params = WriteParams {
-            index_cache_size: 256,
             max_rows_per_file: 1024 * 10, // Won't be limited by this
             max_rows_per_group: 512,
             max_bytes_per_file: 2 * 1024,
