Commit

Move under .stats
rok committed Nov 15, 2023
1 parent 0ddf083 commit b3ad2cc
Showing 4 changed files with 24 additions and 54 deletions.
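In short, this commit moves the index cache size out of the dataset API surface: the LanceDataset.index_cache_size property and the write_dataset(..., index_cache_size=...) parameter are removed, and the value is instead reported per index through dataset.stats.index_stats(). A minimal before/after sketch of reading the value (the uri variable and the index name "vector_idx" are illustrative placeholders, not part of the diff):

import lance

ds = lance.LanceDataset(uri, index_cache_size=10)

# Before this commit: a dataset-level property.
# size = ds.index_cache_size

# After this commit: reported under .stats, per index.
size = ds.stats.index_stats("vector_idx")["index_cache_size"]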
14 changes: 2 additions & 12 deletions python/python/lance/dataset.py
@@ -634,13 +634,6 @@ def version(self) -> int:
         """
         return self._ds.version()
 
-    @property
-    def index_cache_size(self) -> int:
-        """
-        Returns the index cache size of the dataset
-        """
-        return self._ds.index_cache_size()
-
     @property
     def latest_version(self) -> int:
         """
@@ -883,7 +876,7 @@ def create_index(
             kwargs["replace"] = replace
 
         self._ds.create_index(column, index_type, name, metric, kwargs)
-        return LanceDataset(self.uri)
+        return LanceDataset(self.uri, index_cache_size=index_cache_size)
 
     @staticmethod
     def _commit(
@@ -1675,6 +1668,7 @@ def index_stats(self, index_name: str) -> Dict[str, Any]:
         index_stats = json.loads(self._ds.index_statistics(index_name))
         index_stats["num_indexed_rows"] = self._ds.count_indexed_rows(index_name)
         index_stats["num_unindexed_rows"] = self._ds.count_unindexed_rows(index_name)
+        index_stats["index_cache_size"] = self._ds.index_cache_size()
         return index_stats


@@ -1684,7 +1678,6 @@ def write_dataset(
     schema: Optional[pa.Schema] = None,
     mode: str = "create",
     *,
-    index_cache_size: int = 256,
     max_rows_per_file: int = 1024 * 1024,
     max_rows_per_group: int = 1024,
     max_bytes_per_file: int = 90 * 1024 * 1024 * 1024,
@@ -1708,8 +1701,6 @@ def write_dataset(
         **overwrite** - create a new snapshot version
         **append** - create a new version that is the concat of the input the
         latest version (raises if uri does not exist)
-    index_cache_size: int, default 256
-        The number of index entries to cache in memory.
     max_rows_per_file: int, default 1024 * 1024
         The max number of rows to write before starting a new file
     max_rows_per_group: int, default 1024
@@ -1734,7 +1725,6 @@ def write_dataset(
 
     params = {
         "mode": mode,
-        "index_cache_size": index_cache_size,
         "max_rows_per_file": max_rows_per_file,
         "max_rows_per_group": max_rows_per_group,
         "max_bytes_per_file": max_bytes_per_file,
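With index_cache_size gone from write_dataset, the cache is configured where it is actually used: when opening a dataset, or through create_index, which now returns a dataset reopened with the requested size. A sketch of the updated flow, mirroring the Python changes above (tbl and uri are placeholders):

import lance

# write_dataset no longer accepts index_cache_size.
dataset = lance.write_dataset(tbl, uri)

# Configure the cache when building the index; per the diff above,
# create_index reopens the dataset with this cache size.
indexed = dataset.create_index(
    "vector",
    index_type="IVF_PQ",
    num_partitions=128,
    num_sub_vectors=2,
    index_cache_size=10,
)

# Or set it explicitly when (re)opening the dataset.
indexed = lance.LanceDataset(uri, index_cache_size=50)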
55 changes: 22 additions & 33 deletions python/python/tests/test_vector_index.py
@@ -468,41 +468,30 @@ def test_knn_with_deletions(tmp_path):
 
 
 def test_index_cache_size(tmp_path):
-    ndim = 16
-    tbl = create_table(nvec=1024, ndim=ndim)
-    dataset = lance.write_dataset(tbl, tmp_path, index_cache_size=10)
+    def query_index(ds, ntimes):
+        ndim = ds.schema[0].type.list_size
+        for _ in range(ntimes):
+            ds.to_table(
+                nearest={
+                    "column": "vector",
+                    "q": np.random.randn(ndim),
+                },
+            )
 
+    tbl = create_table(nvec=1024, ndim=16)
+    dataset = lance.write_dataset(tbl, tmp_path / "test")
+
     indexed_dataset = dataset.create_index(
-        "vector", index_type="IVF_PQ", num_partitions=256,
+        "vector", index_type="IVF_PQ", num_partitions=128,
         num_sub_vectors=2, index_cache_size=10,
     )
-    indexed_dataset = lance.LanceDataset(indexed_dataset.uri, index_cache_size=10)
-    # indexed_dataset = lance.write_dataset(tbl, tmp_path, index_cache_size=10)
 
-    assert indexed_dataset.index_cache_size == 0
-
-    q = np.random.randn(ndim)
-
-    indexed_dataset.to_table(
-        nearest={
-            "column": "vector",
-            "q": q,
-        },
-    )
-    assert indexed_dataset.index_cache_size == 2
+    assert indexed_dataset.stats.index_stats("vector_idx")["index_cache_size"] == 1
+    query_index(indexed_dataset, 1)
+    assert indexed_dataset.stats.index_stats("vector_idx")["index_cache_size"] == 2
+    query_index(indexed_dataset, 128)
+    assert indexed_dataset.stats.index_stats("vector_idx")["index_cache_size"] == 10
 
-    indexed_dataset.to_table(
-        nearest={
-            "column": "vector",
-            "q": q,
-        },
-    )
-    assert indexed_dataset.index_cache_size == 2
-
-    for _ in range(128):
-        indexed_dataset.to_table(
-            nearest={
-                "column": "vector",
-                "q": np.random.randn(ndim),
-            },
-        )
-    assert indexed_dataset.index_cache_size == 10
+    indexed_dataset = lance.LanceDataset(indexed_dataset.uri, index_cache_size=50)
+    query_index(indexed_dataset, 256)
+    assert indexed_dataset.stats.index_stats("vector_idx")["index_cache_size"] == 50
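The rewritten test also documents the stat's apparent semantics: the cache holds one entry right after the index is built, grows as queries probe new partitions, and is capped at the configured index_cache_size; reopening with a larger budget raises the cap. A hedged helper for checking whether a given budget saturates under random query load (the helper itself is hypothetical; the column name "vector" and index name "vector_idx" follow the test above):

import lance
import numpy as np

def cache_saturated(uri, index_name="vector_idx", budget=10, nqueries=128):
    # Open with the cache budget under test.
    ds = lance.LanceDataset(uri, index_cache_size=budget)
    ndim = ds.schema.field("vector").type.list_size
    # Random queries so the scans probe many IVF partitions.
    for _ in range(nqueries):
        ds.to_table(nearest={"column": "vector", "q": np.random.randn(ndim)})
    used = ds.stats.index_stats(index_name)["index_cache_size"]
    # True if the cache filled all the way up to its budget.
    return used >= budget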
4 changes: 0 additions & 4 deletions python/src/dataset.rs
@@ -598,7 +598,6 @@ impl Dataset {
         index_type: &str,
         name: Option<String>,
         metric_type: Option<&str>,
-        index_cache_size: Option<usize>,
         kwargs: Option<&PyDict>,
     ) -> PyResult<()> {
         let idx_type = match index_type.to_uppercase().as_str() {
@@ -858,9 +857,6 @@ pub fn get_write_params(options: &PyDict) -> PyResult<Option<WriteParams>> {
         if let Some(mode) = options.get_item("mode") {
             p.mode = parse_write_mode(mode.extract::<String>()?.as_str())?;
         };
-        if let Some(index_cache_size) = options.get_item("index_cache_size") {
-            p.index_cache_size = usize::extract(index_cache_size)?;
-        }
         if let Some(maybe_nrows) = options.get_item("max_rows_per_file") {
             p.max_rows_per_file = usize::extract(maybe_nrows)?;
         }
5 changes: 0 additions & 5 deletions rust/lance/src/dataset/write.rs
@@ -52,9 +52,6 @@ pub enum WriteMode {
 /// Dataset Write Parameters
 #[derive(Debug, Clone)]
 pub struct WriteParams {
-    /// Index cache size
-    pub index_cache_size: usize,
-
     /// Max number of records per file.
     pub max_rows_per_file: usize,
 
@@ -85,7 +82,6 @@ pub struct WriteParams {
 impl Default for WriteParams {
     fn default() -> Self {
         Self {
-            index_cache_size: 256,
             max_rows_per_file: 1024 * 1024, // 1 million
             max_rows_per_group: 1024,
             // object-store has a 100GB limit, so we should at least make sure
@@ -352,7 +348,6 @@ mod tests {
             .unwrap();
 
         let write_params = WriteParams {
-            index_cache_size: 256,
             max_rows_per_file: 1024 * 10, // Won't be limited by this
             max_rows_per_group: 512,
             max_bytes_per_file: 2 * 1024,
