From bd4ff0d6c900295268dd2fa5130ed0dc515909b3 Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Wed, 19 Apr 2023 21:31:45 -0700
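Subject: [PATCH] Speed up vector index tests by reducing dataset size

Shrink the generated test table from 10000 x 768 to 1000 x 128 vectors,
shorten the random "meta" strings from 1000 to 100 characters, and build
the IVF_PQ indexes with 4 partitions instead of 32. Query vectors are
resized to 128 dimensions to match.

Also remove the skipif markers that skipped test_ann, test_ann_append,
test_use_index and test_has_index on non-arm64 Darwin.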
---
 python/python/tests/test_vector_index.py | 32 ++++++------------------
 1 file changed, 8 insertions(+), 24 deletions(-)

diff --git a/python/python/tests/test_vector_index.py b/python/python/tests/test_vector_index.py
index 526b5101cf1..64dae668df8 100644
--- a/python/python/tests/test_vector_index.py
+++ b/python/python/tests/test_vector_index.py
@@ -36,14 +36,14 @@
 from lance.vector import vec_to_table
 
 
-def create_table(nvec=10000, ndim=768):
+def create_table(nvec=1000, ndim=128):
     mat = np.random.randn(nvec, ndim)
     price = np.random.rand(nvec) * 100
 
     def gen_str(n):
         return "".join(random.choices(string.ascii_letters + string.digits, k=n))
 
-    meta = np.array([gen_str(1000) for _ in range(nvec)])
+    meta = np.array([gen_str(100) for _ in range(nvec)])
     tbl = (
         vec_to_table(data=mat)
         .append_column("price", pa.array(price))
@@ -64,13 +64,13 @@ def indexed_dataset(tmp_path):
     tbl = create_table()
     dataset = lance.write_dataset(tbl, tmp_path)
     yield dataset.create_index(
-        "vector", index_type="IVF_PQ", num_partitions=32, num_sub_vectors=16
+        "vector", index_type="IVF_PQ", num_partitions=4, num_sub_vectors=16
     )
 
 
 def run(ds, q=None, assert_func=None):
     if q is None:
-        q = np.random.randn(768)
+        q = np.random.randn(128)
     project = [None, ["price"], ["vector", "price"], ["vector", "meta", "price"]]
     refine = [None, 1, 2]
     filters = [None, pc.field("price") > 50.0]
@@ -119,23 +119,15 @@ def test_flat(dataset):
     print(run(dataset))
 
 
-@pytest.mark.skipif(
-    (platform.system() == "Darwin") and (platform.machine() != "arm64"),
-    reason="no neon on GHA",
-)
 def test_ann(indexed_dataset):
     print(run(indexed_dataset))
 
 
-@pytest.mark.skipif(
-    (platform.system() == "Darwin") and (platform.machine() != "arm64"),
-    reason="no neon on GHA",
-)
 def test_ann_append(tmp_path):
     tbl = create_table()
     dataset = lance.write_dataset(tbl, tmp_path)
     dataset = dataset.create_index(
-        "vector", index_type="IVF_PQ", num_partitions=32, num_sub_vectors=16
+        "vector", index_type="IVF_PQ", num_partitions=4, num_sub_vectors=16
     )
     new_data = create_table(nvec=100)
     dataset = lance.write_dataset(new_data, dataset.uri, mode="append")
@@ -147,16 +139,12 @@ def func(rs: pa.Table):
     print(run(dataset, q=np.array(q), assert_func=func))
 
 
-@pytest.mark.skipif(
-    (platform.system() == "Darwin") and (platform.machine() != "arm64"),
-    reason="no neon on GHA",
-)
 def test_use_index(dataset, tmp_path):
     ann_ds = lance.write_dataset(dataset.to_table(), tmp_path / "indexed.lance")
     ann_ds = ann_ds.create_index(
-        "vector", index_type="IVF_PQ", num_partitions=32, num_sub_vectors=16
+        "vector", index_type="IVF_PQ", num_partitions=4, num_sub_vectors=16
     )
-    q = np.random.randn(768)
+    q = np.random.randn(128)
     expected = dataset.to_table(
         columns=["id"],
         nearest={
@@ -174,15 +162,11 @@ def test_use_index(dataset, tmp_path):
     assert np.all(expected == actual)
 
 
-@pytest.mark.skipif(
-    (platform.system() == "Darwin") and (platform.machine() != "arm64"),
-    reason="no neon on GHA",
-)
 def test_has_index(dataset, tmp_path):
     assert not dataset.has_index
     ann_ds = lance.write_dataset(dataset.to_table(), tmp_path / "indexed.lance")
     ann_ds = ann_ds.create_index(
-        "vector", index_type="IVF_PQ", num_partitions=32, num_sub_vectors=16
+        "vector", index_type="IVF_PQ", num_partitions=4, num_sub_vectors=16
     )
     assert ann_ds.has_index