fix: the routing to find ground truth in sift benchmark (#1740)
eddyxu committed Dec 20, 2023
1 parent 1586f9c commit a3363da
Showing 2 changed files with 59 additions and 64 deletions.
100 changes: 38 additions & 62 deletions benchmarks/sift/metrics.py
@@ -22,51 +22,7 @@
import lance
import numpy as np
import pandas as pd


def recall(actual_sorted: np.ndarray, results: np.ndarray):
"""
Recall-at-k
Parameters
----------
actual_sorted: ndarray
The ground truth
results: ndarray
The ANN results
"""
len = results.shape[1]
recall_at_k = np.array([np.sum([1 if id in results[i, :] else 0 for id in row]) * 1.0 / len
for i, row in enumerate(actual_sorted)])
return (recall_at_k.mean(), recall_at_k.std(), recall_at_k)


def l2_argsort(mat, q):
"""
Parameters
----------
mat: ndarray
shape is (n, d) where n is number of vectors and d is number of dims
q: ndarray
shape is d, this is the query vector
"""
return np.argsort(((mat - q) ** 2).sum(axis=1))


def cosine_argsort(mat, q):
"""
argsort of cosine distances
Parameters
----------
mat: ndarray
shape is (n, d) where n is number of vectors and d is number of dims
q: ndarray
shape is d, this is the query vector
"""
mat = mat / np.linalg.norm(mat, axis=1)[:, None]
q = q / np.linalg.norm(q)
scores = np.dot(mat, q)
return np.argsort(1 - scores)
from lance.torch.bench_utils import ground_truth as gt_func, recall
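(Illustration, not part of the commit: a minimal end-to-end sketch of the two imported helpers on synthetic data. The /tmp path, the toy dataset size, and reusing base vectors as queries are assumptions.)

import numpy as np
import pyarrow as pa
import lance
from lance.torch.bench_utils import ground_truth as gt_func, recall

dim, n = 8, 256
vecs = np.random.rand(n, dim).astype(np.float32)
arr = pa.FixedSizeListArray.from_arrays(pa.array(vecs.reshape(-1)), dim)
ds = lance.write_dataset(pa.table({"vector": arr}), "/tmp/toy_vectors.lance")

queries = vecs[:4]  # reuse a few base vectors as queries
gt = gt_func(ds, "vector", queries, k=5).cpu().numpy()  # exhaustive top-5 row ids, shape (4, 5)
print(recall(gt, gt))  # scoring the ground truth against itself -> [1. 1. 1. 1.]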


def get_query_vectors(uri, nsamples=1000, normalize=False):
@@ -84,14 +40,19 @@ def get_query_vectors(uri, nsamples=1000, normalize=False):
query_vectors = duckdb.query(
f"SELECT vector FROM tbl USING SAMPLE {nsamples}"
).to_df()
query_vectors = np.array([np.array(x) for x in query_vectors.vector.values])
if normalize:
query_vectors = query_vectors / np.linalg.norm(query_vectors, axis=1)[:, None]
return query_vectors
return query_vectors.astype(np.float32)
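(Illustration only; the query-dataset URI below is hypothetical.)

# Sample 100 query vectors and L2-normalize them; returns a float32 ndarray of shape (100, d).
queries = get_query_vectors("sift_query.lance", nsamples=100, normalize=True)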


def test_dataset(
uri, query_vectors, ground_truth, k=10, nprobes=1, refine_factor: Optional[int] = None
uri,
query_vectors,
ground_truth,
k=10,
nprobes=1,
refine_factor: Optional[int] = None,
):
"""
Compute the recall for a given query configuration
@@ -109,9 +70,9 @@ def test_dataset(
nprobes: int
Number of probes during search
refine_factor: int
Refine factor during search
"""
dataset = lance.dataset(uri)
actual_sorted = []
results = []

@@ -136,7 +97,8 @@
if i % 100 == 0:
print(f"Done {i}")
avg_latency = tot / ground_truth.shape[0]
return recall(np.array(actual_sorted), np.array(results)), avg_latency
recalls = recall(np.array(actual_sorted), np.array(results))
return recalls.mean(), avg_latency
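(Illustration only; uri, queries, and gt stand in for the values built in __main__ below.)

# One configuration: 10 probes, no refinement -> (mean recall@10, average query latency in seconds).
mean_recall, mean_latency = test_dataset(uri, queries, gt, k=10, nprobes=10, refine_factor=None)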


if __name__ == "__main__":
@@ -146,7 +108,14 @@ def test_dataset(
parser.add_argument("-i", "--ivf-partitions", type=int, metavar="N")
parser.add_argument("-p", "--pq", type=int, metavar="N")
parser.add_argument("-s", "--samples", default=1000, type=int, metavar="N")
parser.add_argument("-q", "--queries", type=str, default=None, help="lance dataset uri containing query vectors", metavar="URI")
parser.add_argument(
"-q",
"--queries",
type=str,
default=None,
help="lance dataset uri containing query vectors",
metavar="URI",
)
parser.add_argument("-k", "--top_k", default=10, type=int, metavar="N")
parser.add_argument("-n", "--normalize", action="store_true")
args = parser.parse_args()
@@ -171,21 +140,28 @@ def test_dataset(
refine_factor = []
recall_at_k = []
mean_time = []
query_vectors = get_query_vectors(args.queries, nsamples=args.samples,
normalize=args.normalize)
tbl = lance.dataset(args.uri).to_table()
v = tbl["vector"].combine_chunks()
all_vectors = v.values.to_numpy().reshape(len(tbl), v.type.list_size)
query_vectors = get_query_vectors(
args.queries, nsamples=args.samples, normalize=args.normalize
)
ds = lance.dataset(args.uri)
tbl = ds.to_table()
v = tbl["vector"].combine_chunks()
all_vectors = v.values.to_numpy().reshape(len(tbl), v.type.list_size)
print("Computing ground truth")
ground_truth = np.array([l2_argsort(all_vectors, query_vectors[i, :])
for i in range(args.samples)])
start = time.time()
gt = (
gt_func(ds, "vector", query_vectors.astype(np.float32), k=args.top_k)
.cpu()
.numpy()
)
print(f"Get ground truth in: {time.time() - start:0.3f}s")
print("Starting benchmarks")
for n in [1, 10, 25, 50, 75, 100]:
for rf in [None, 1, 10, 20, 30, 40, 50]:
recalls, times = test_dataset(
args.uri,
query_vectors,
ground_truth,
gt,
k=args.top_k,
nprobes=n,
refine_factor=rf,
@@ -197,10 +173,10 @@ def test_dataset(
queries.append(args.queries)
topk.append(args.top_k)
refine_factor.append(rf)
recall_at_k.append(recalls[0])
recall_at_k.append(recalls)
mean_time.append(times)
print(
f"nprobes: {n}, refine={rf}, recall@{args.top_k}={recalls[0]:0.3f}, mean(s)={times}"
f"nprobes: {n}, refine={rf}, recall@{args.top_k}={recalls:0.3f}, mean(s)={times}"
)

df = pd.DataFrame(
23 changes: 21 additions & 2 deletions python/python/lance/torch/bench_utils.py
@@ -22,7 +22,7 @@
from .. import LanceDataset
from . import preferred_device
from .data import LanceDataset as PytorchLanceDataset
from .distance import pairwise_l2
from .distance import pairwise_cosine, pairwise_l2

__all__ = ["ground_truth"]

@@ -106,7 +106,9 @@ def ground_truth(
if metric_type == "l2":
dists = pairwise_l2(query, vectors)
elif metric_type == "cosine":
raise NotImplementedError("Cosine distance is not implemented yet.")
dists = pairwise_cosine(query, vectors, device=device)
else:
raise ValueError(f"Unknown metric type: {metric_type}")

dists, row_ids = sort_tensors(dists, row_ids, k)

@@ -122,3 +124,20 @@
all_dists, all_ids = sort_tensors(all_dists, all_ids, k)

return all_ids
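(Illustration, not part of the commit: with cosine distance now wired in, a hypothetical call mirroring the SIFT benchmark's use of this function. The metric_type keyword and its "cosine" value are inferred from the body above, so treat the exact signature as an assumption.)

import numpy as np
import lance
from lance.torch.bench_utils import ground_truth

ds = lance.dataset("vectors.lance")  # hypothetical dataset with a "vector" column
queries = np.random.rand(4, 128).astype(np.float32)
gt_ids = ground_truth(ds, "vector", queries, metric_type="cosine", k=10).cpu().numpy()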


def recall(expected: np.ndarray, actual: np.ndarray) -> np.ndarray:
"""Recalls
Parameters
----------
expected: ndarray
The ground truth
results: ndarray
The ANN results
"""
assert expected.shape == actual.shape
recalls = np.array(
[np.isin(exp, act).sum() / exp.shape[0] for exp, act in zip(expected, actual)]
)
return recalls
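(Worked example, for illustration: two queries with k=3.)

expected = np.array([[1, 2, 3], [4, 5, 6]])  # ground-truth row ids per query
actual = np.array([[1, 2, 9], [4, 7, 8]])    # ANN results per query
recall(expected, actual)                     # -> array([0.6667..., 0.3333...])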
