fix: the routing to find ground truth in sift benchmark (#1740)
eddyxu committed Dec 20, 2023
1 parent 1586f9c commit a3363da
Showing 2 changed files with 59 additions and 64 deletions.
100 changes: 38 additions & 62 deletions benchmarks/sift/metrics.py
@@ -22,51 +22,7 @@
import lance
import numpy as np
import pandas as pd


def recall(actual_sorted: np.ndarray, results: np.ndarray):
"""
Recall-at-k
Parameters
----------
actual_sorted: ndarray
The ground truth
results: ndarray
The ANN results
"""
len = results.shape[1]
recall_at_k = np.array([np.sum([1 if id in results[i, :] else 0 for id in row]) * 1.0 / len
for i, row in enumerate(actual_sorted)])
return (recall_at_k.mean(), recall_at_k.std(), recall_at_k)


def l2_argsort(mat, q):
"""
Parameters
----------
mat: ndarray
shape is (n, d) where n is number of vectors and d is number of dims
q: ndarray
shape is d, this is the query vector
"""
return np.argsort(((mat - q) ** 2).sum(axis=1))


def cosine_argsort(mat, q):
"""
argsort of cosine distances
Parameters
----------
mat: ndarray
shape is (n, d) where n is number of vectors and d is number of dims
q: ndarray
shape is d, this is the query vector
"""
mat = mat / np.linalg.norm(mat, axis=1)[:, None]
q = q / np.linalg.norm(q)
scores = np.dot(mat, q)
return np.argsort(1 - scores)
from lance.torch.bench_utils import ground_truth as gt_func, recall
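(Illustration, not part of the commit: a minimal end-to-end sketch of the two imported helpers on synthetic data. The /tmp path, the toy dataset size, and reusing base vectors as queries are assumptions.)

import numpy as np
import pyarrow as pa
import lance
from lance.torch.bench_utils import ground_truth as gt_func, recall

dim, n = 8, 256
vecs = np.random.rand(n, dim).astype(np.float32)
arr = pa.FixedSizeListArray.from_arrays(pa.array(vecs.reshape(-1)), dim)
ds = lance.write_dataset(pa.table({"vector": arr}), "/tmp/toy_vectors.lance")

queries = vecs[:4]  # reuse a few base vectors as queries
gt = gt_func(ds, "vector", queries, k=5).cpu().numpy()  # exhaustive top-5 row ids, shape (4, 5)
print(recall(gt, gt))  # scoring the ground truth against itself -> [1. 1. 1. 1.]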


def get_query_vectors(uri, nsamples=1000, normalize=False):
@@ -84,14 +40,19 @@ def get_query_vectors(uri, nsamples=1000, normalize=False):
query_vectors = duckdb.query(
f"SELECT vector FROM tbl USING SAMPLE {nsamples}"
).to_df()
query_vectors = np.array([np.array(x) for x in query_vectors.vector.values])
if normalize:
query_vectors = query_vectors / np.linalg.norm(query_vectors, axis=1)[:, None]
return query_vectors
return query_vectors.astype(np.float32)
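(Illustration only; the query-dataset URI below is hypothetical.)

# Sample 100 query vectors and L2-normalize them; returns a float32 ndarray of shape (100, d).
queries = get_query_vectors("sift_query.lance", nsamples=100, normalize=True)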


def test_dataset(
uri, query_vectors, ground_truth, k=10, nprobes=1, refine_factor: Optional[int] = None
uri,
query_vectors,
ground_truth,
k=10,
nprobes=1,
refine_factor: Optional[int] = None,
):
"""
Compute the recall for a given query configuration
@@ -109,9 +70,9 @@ def test_dataset(
nprobes: int
Number of probes during search
refine_factor: int
Refine factor during search
"""
dataset = lance.dataset(uri)
actual_sorted = []
results = []

@@ -136,7 +97,8 @@
if i % 100 == 0:
print(f"Done {i}")
avg_latency = tot / ground_truth.shape[0]
return recall(np.array(actual_sorted), np.array(results)), avg_latency
recalls = recall(np.array(actual_sorted), np.array(results))
return recalls.mean(), avg_latency
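(Illustration only; uri, queries, and gt stand in for the values built in __main__ below.)

# One configuration: 10 probes, no refinement -> (mean recall@10, average query latency in seconds).
mean_recall, mean_latency = test_dataset(uri, queries, gt, k=10, nprobes=10, refine_factor=None)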


if __name__ == "__main__":
@@ -146,7 +108,14 @@ def test_dataset(
parser.add_argument("-i", "--ivf-partitions", type=int, metavar="N")
parser.add_argument("-p", "--pq", type=int, metavar="N")
parser.add_argument("-s", "--samples", default=1000, type=int, metavar="N")
parser.add_argument("-q", "--queries", type=str, default=None, help="lance dataset uri containing query vectors", metavar="URI")
parser.add_argument(
"-q",
"--queries",
type=str,
default=None,
help="lance dataset uri containing query vectors",
metavar="URI",
)
parser.add_argument("-k", "--top_k", default=10, type=int, metavar="N")
parser.add_argument("-n", "--normalize", action="store_true")
args = parser.parse_args()
@@ -171,21 +140,28 @@ def test_dataset(
refine_factor = []
recall_at_k = []
mean_time = []
query_vectors = get_query_vectors(args.queries, nsamples=args.samples,
normalize=args.normalize)
tbl = lance.dataset(args.uri).to_table()
v = tbl["vector"].combine_chunks()
all_vectors = v.values.to_numpy().reshape(len(tbl), v.type.list_size)
query_vectors = get_query_vectors(
args.queries, nsamples=args.samples, normalize=args.normalize
)
ds = lance.dataset(args.uri)
tbl = ds.to_table()
v = tbl["vector"].combine_chunks()
all_vectors = v.values.to_numpy().reshape(len(tbl), v.type.list_size)
print("Computing ground truth")
ground_truth = np.array([l2_argsort(all_vectors, query_vectors[i, :])
for i in range(args.samples)])
start = time.time()
gt = (
gt_func(ds, "vector", query_vectors.astype(np.float32), k=args.top_k)
.cpu()
.numpy()
)
print(f"Get ground truth in: {time.time() - start:0.3f}s")
print("Starting benchmarks")
for n in [1, 10, 25, 50, 75, 100]:
for rf in [None, 1, 10, 20, 30, 40, 50]:
recalls, times = test_dataset(
args.uri,
query_vectors,
ground_truth,
gt,
k=args.top_k,
nprobes=n,
refine_factor=rf,
@@ -197,10 +173,10 @@ def test_dataset(
queries.append(args.queries)
topk.append(args.top_k)
refine_factor.append(rf)
recall_at_k.append(recalls[0])
recall_at_k.append(recalls)
mean_time.append(times)
print(
f"nprobes: {n}, refine={rf}, recall@{args.top_k}={recalls[0]:0.3f}, mean(s)={times}"
f"nprobes: {n}, refine={rf}, recall@{args.top_k}={recalls:0.3f}, mean(s)={times}"
)

df = pd.DataFrame(
23 changes: 21 additions & 2 deletions python/python/lance/torch/bench_utils.py
@@ -22,7 +22,7 @@
from .. import LanceDataset
from . import preferred_device
from .data import LanceDataset as PytorchLanceDataset
from .distance import pairwise_l2
from .distance import pairwise_cosine, pairwise_l2

__all__ = ["ground_truth"]

@@ -106,7 +106,9 @@ def ground_truth(
if metric_type == "l2":
dists = pairwise_l2(query, vectors)
elif metric_type == "cosine":
raise NotImplementedError("Cosine distance is not implemented yet.")
dists = pairwise_cosine(query, vectors, device=device)
else:
raise ValueError(f"Unknown metric type: {metric_type}")

dists, row_ids = sort_tensors(dists, row_ids, k)

@@ -122,3 +124,20 @@
all_dists, all_ids = sort_tensors(all_dists, all_ids, k)

return all_ids
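(Illustration, not part of the commit: with cosine distance now wired in, a hypothetical call mirroring the SIFT benchmark's use of this function. The metric_type keyword and its "cosine" value are inferred from the body above, so treat the exact signature as an assumption.)

import numpy as np
import lance
from lance.torch.bench_utils import ground_truth

ds = lance.dataset("vectors.lance")  # hypothetical dataset with a "vector" column
queries = np.random.rand(4, 128).astype(np.float32)
gt_ids = ground_truth(ds, "vector", queries, metric_type="cosine", k=10).cpu().numpy()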


def recall(expected: np.ndarray, actual: np.ndarray) -> np.ndarray:
"""Recalls
Parameters
----------
expected: ndarray
The ground truth
results: ndarray
The ANN results
"""
assert expected.shape == actual.shape
recalls = np.array(
[np.isin(exp, act).sum() / exp.shape[0] for exp, act in zip(expected, actual)]
)
return recalls
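(Worked example, for illustration: two queries with k=3.)

expected = np.array([[1, 2, 3], [4, 5, 6]])  # ground-truth row ids per query
actual = np.array([[1, 2, 9], [4, 7, 8]])    # ANN results per query
recall(expected, actual)                     # -> array([0.6667..., 0.3333...])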
