In [0]:
%pip install lancedb numpy
# NOTE(review): no versions are pinned on the install line above, so a re-run may resolve different releases.
# Restart the Python process after the install (presumably so the freshly installed packages are picked up — confirm against the Databricks docs).
dbutils.library.restartPython()

In [0]:
import numpy as np
import lancedb

def create_arrays(n, dimensions):
    """Build a list of ``n`` random vectors, each of length ``dimensions``.

    Every vector is drawn independently with ``np.random.randint`` from the
    integers 0..255 (the upper bound 256 is exclusive) and then cast to
    ``np.float16``. The global NumPy random state is used, so seed it
    beforehand if reproducible vectors are needed.

    Args:
        n: Number of vectors to generate.
        dimensions: Length of each vector.

    Returns:
        A Python list containing ``n`` one-dimensional ``float16`` arrays.
    """
    vectors = []
    for _ in range(n):
        integer_values = np.random.randint(0, 256, size=dimensions)
        vectors.append(integer_values.astype(np.float16))
    return vectors
  

In [0]:
%sh
# Create the directory that is later passed to lancedb.connect() as the database URI.
# -p: also creates missing parents and does not fail if the directory already exists.
mkdir -p /tmp/lancedb

In [0]:
# Local filesystem path used as the LanceDB URI; search_query() further down reuses it to open its own connection.
uri = "/tmp/lancedb"
# Notebook-level database handle; the table is created through it in a later cell.
db = lancedb.connect(uri)

In [0]:
import pandas as pd
import numpy as np
import random

# Number of rows to generate. The table created below is named "..._100m",
# so the row count is set to 100 million to match it. The previous value,
# 1_000_000_000, was ten times what the table name advertises.
num_vectors = 100_000_000

# `create_arrays` is defined once in the imports cell near the top of the
# notebook; it is intentionally not redefined here.
# Each row holds a random 35-dimensional float16 vector and a sequential id.
data = pd.DataFrame(
    {
        "vector": create_arrays(num_vectors, dimensions=35),
        "id": np.arange(num_vectors),
    }
)
# mode="overwrite" replaces an existing table of the same name, so the cell can be re-run.
tbl = db.create_table("my_table_pandas_100m", data, mode="overwrite")

# Create an IVF index on top of the table to improve latency at the cost of some accuracy.
# https://lancedb.github.io/lancedb/ann_indexes/#how-to-choose-num_partitions-and-num_sub_vectors-for-ivf_pq-index
# - num_sub_vectors: should be a factor of the vector dimension (35 = 5 * 7, so 5 is valid).
#   Because PQ is a lossy compression of the original vector, a higher num_sub_vectors
#   usually results in less space distortion, and thus yields better accuracy.
# - num_partitions: on the SIFT-1M dataset, the LanceDB benchmark shows that keeping each
#   partition at 4K-8K rows leads to a good latency / recall trade-off.
# NOTE(review): 100M rows over 1000 partitions is ~100K rows per partition, well above the
# 4K-8K guideline quoted above — confirm this partition count is intentional.
tbl.create_index(metric="l2", num_partitions=1000, num_sub_vectors=5)

In [0]:
# Timing notes for a single batched search call (top 5 results per query):
#   - brute-force search, 100 queries: ~20s
#   - IVF search, 100 queries: ~2s
#   - IVF search, 1k queries: ~15s
# Batched-search snippet (kept disabled):
# n_test_vectors = 1000
# test_micro_batch = create_arrays(n_test_vectors, 35)
# results = tbl.search(test_micro_batch).limit(5).to_pandas()

In [0]:
# Build a batch of 1000 random 35-dimensional query vectors.
n_test_vectors = 1000
test_micro_batch = create_arrays(n_test_vectors, 35)


def search_query(query_vector):
    """Search "my_table_pandas_100m" through a freshly opened connection.

    A new LanceDB connection and table handle are opened on every call
    instead of reusing the notebook-level ``db`` / ``tbl`` objects, so the
    function does not depend on those handles being shareable with whatever
    worker executes it. Nothing in this cell runs it in parallel.

    Args:
        query_vector: Query handed straight to ``search``. The call at the
            bottom of this cell passes the whole list of test vectors at once.

    Returns:
        The search result, limited to 5, converted with ``to_pandas()``.
    """
    connection = lancedb.connect(uri)
    table = connection.open_table("my_table_pandas_100m")
    query = table.search(query_vector)
    top_hits = query.limit(5)
    return top_hits.to_pandas()


# Last expression of the cell: the returned DataFrame is displayed as the cell output.
search_query(test_micro_batch)

In [0]:
# --- ThreadPool Search Function ---
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

def search_single_vector(table, vector, limit):
    """
    Run one search query against ``table`` and return the result as a DataFrame.

    This is the unit of work that ``threaded_lancedb_search`` hands to each
    thread of its pool.

    Args:
        table: Table object exposing ``search(...)``.
        vector: The query passed to ``table.search``.
        limit: Maximum number of results requested for this query.

    Returns:
        Whatever ``to_pandas()`` yields for the limited query.
    """
    query = table.search(vector)
    limited_query = query.limit(limit)
    return limited_query.to_pandas()

def threaded_lancedb_search(table, test_vectors, limit=1, max_workers=None):
    """
    Run one LanceDB search per query vector on a thread pool and combine the results.

    Args:
        table: The LanceDB table object to search against.
        test_vectors: An iterable of query vectors (e.g., NumPy arrays).
        limit: The maximum number of results to return for each query.
        max_workers: The maximum number of threads to use. ``None`` defers to
            ``ThreadPoolExecutor``'s own default (documented as
            ``min(32, os.cpu_count() + 4)`` since Python 3.8).

    Returns:
        A pandas DataFrame with the per-query results concatenated in the same
        order as ``test_vectors`` and re-indexed from 0. Rows are not tagged
        with the query they belong to. An empty DataFrame is returned when
        there are no query vectors.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in input order (not completion order),
        # so the collected frames stay aligned with test_vectors. Draining the
        # iterator inside the `with` block surfaces any worker exception here.
        per_query_frames = list(
            executor.map(lambda vec: search_single_vector(table, vec, limit), test_vectors)
        )

    if not per_query_frames:
        return pd.DataFrame()  # nothing was searched
    return pd.concat(per_query_frames, ignore_index=True)

# --- Example Usage ---
# Build a batch of 10,000 random 35-dimensional query vectors.
n_test_vectors = 10_000
test_micro_batch = create_arrays(n_test_vectors, 35)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; unlike time.time() it cannot jump if the system clock is adjusted.
start = time.perf_counter()
print(f"\nStarting threaded search for {n_test_vectors} queries...")

# Perform the threaded search
# You can adjust max_workers based on your system's capabilities and workload
results_threaded = threaded_lancedb_search(tbl, test_micro_batch, limit=5, max_workers=64)
run_time = time.perf_counter() - start
print(f"Threaded search completed in {run_time:.2f} seconds.")