# Vector Search Python SDK example usage

This notebook demonstrates usage of the Vector Search Python SDK, which provides a `VectorSearchClient` as a primary API for working with Vector Search.

Alternatively, you may call the REST API directly.

**Prerequisite**: This notebook assumes you have already created a Model Serving endpoint for the embedding model. See the `embedding_model_endpoint` variable below, and the companion notebook for creating endpoints.

In [0]:
%pip install --upgrade --force-reinstall databricks-vectorsearch
# Restart the Python process after the install so the imports below pick up the new package.
dbutils.library.restartPython()

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Client object used for every Vector Search call in the rest of this notebook.
vsc = VectorSearchClient()

In [0]:
# Print the built-in documentation for the client class.
help(VectorSearchClient)

## Load toy dataset into source Delta table

In [0]:
# Location of the source Delta table that this notebook creates.
source_catalog = "ling_test_demo"
source_schema = "default"
source_table = "en_wiki"
# Build the three-level name from its parts so that editing any value above
# is reflected everywhere the full name is used.
source_table_fullname = f"{source_catalog}.{source_schema}.{source_table}"

In [0]:
# Create the catalog for the source data if it does not already exist.
# Comment this out if the catalog is managed elsewhere.

spark.sql(f"CREATE CATALOG IF NOT EXISTS {source_catalog}")

In [0]:
# Create the source schema if it does not already exist.
# Comment this out if the schema is managed elsewhere.

spark.sql(f"CREATE SCHEMA IF NOT EXISTS {source_catalog}.{source_schema} COMMENT 'This is a schema for source data for Vector Search indexes.'")

In [0]:
# Start from scratch: remove any source table left over from a previous run.
# IF EXISTS keeps this cell from failing on a first run, when there is no table to drop yet.

spark.sql(f"DROP TABLE IF EXISTS {source_table_fullname}")

In [0]:
# Read the Wikipedia articles parquet dataset and keep only ten rows as a toy sample.
source_df = (
    spark.read
    .parquet("dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/articles-only-parquet")
    .limit(10)
)
display(source_df)

In [0]:
# Echo the fully qualified table name that the next cell writes to.
source_table_fullname

In [0]:
# Save the sample as a Delta table, passing the change data feed option on write.
(
    source_df.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(source_table_fullname)
)

In [0]:
# Read the table back to confirm the write succeeded.
display(
    spark.sql(f"SELECT * FROM {source_table_fullname}")
)


## Create Vector Search Endpoint

In [0]:
# Name of the Vector Search endpoint that is created and used below.
vector_search_endpoint_name = "vector-search-endpoint"

In [0]:
# Create the endpoint only if it is not already present, so the notebook can be
# re-run from the top without failing on an endpoint that already exists.
# NOTE(review): assumes list_endpoints() returns a dict with an "endpoints" list of
# dicts carrying a "name" key -- confirm against the installed SDK version.
existing_endpoints = vsc.list_endpoints().get("endpoints", [])
if any(ep.get("name") == vector_search_endpoint_name for ep in existing_endpoints):
    print(f"Endpoint {vector_search_endpoint_name!r} already exists; skipping creation.")
else:
    vsc.create_endpoint(
        name=vector_search_endpoint_name,
        endpoint_type="STANDARD"
    )

In [0]:
# Look the endpoint up by name and show what the service returns for it.
endpoint = vsc.get_endpoint(name=vector_search_endpoint_name)
endpoint

## Create vector index

In [0]:
# Vector index.
# The index is named after the source table with an "_index" suffix, and lives in the
# same catalog and schema. It must NOT reuse the source table's own full name:
# get_index() and delete_index() below address the index through vs_index_fullname.
vs_index = f"{source_table}_index"
vs_index_fullname = f"{source_catalog}.{source_schema}.{vs_index}"

# Model Serving endpoint that hosts the embedding model (see the prerequisite at the top).
embedding_model_endpoint = "vector_search_demo-e5-small-v2"

In [0]:
# Echo the endpoint name that the index is created on.
vector_search_endpoint_name

In [0]:
# Create a delta-sync index over the source table, using the names configured in the
# cells above instead of repeating them as literals.
# The index name is the source table's full name with an "_index" suffix.
# The embedding model endpoint is taken from `embedding_model_endpoint`, the variable
# the prerequisite note at the top of the notebook refers to.
index = vsc.create_delta_sync_index(
  endpoint_name=vector_search_endpoint_name,
  source_table_name=source_table_fullname,
  index_name=f"{source_table_fullname}_index",
  pipeline_type="TRIGGERED",
  primary_key="id",
  embedding_source_column="text",
  embedding_model_endpoint_name=embedding_model_endpoint,
)
index.describe()

## Get a vector index  

In [0]:
# Get a vector index.
# get_index() retrieves the index object, addressed by endpoint name and index name.
index = vsc.get_index(endpoint_name=vector_search_endpoint_name, index_name=vs_index_fullname)
# describe() on the index object shows a summary of the index's configuration information.
index.describe()

In [0]:
# Wait for the index to become online. Expect this command to take several minutes.
import time

poll_interval_seconds = 5
# Upper bound on the wait, so a stuck or failed index cannot hang the notebook forever.
timeout_seconds = 30 * 60
deadline = time.monotonic() + timeout_seconds

while True:
  index_status = index.describe().get("status") or {}
  # NOTE(review): the state appears under "detailed_state" or "status" depending on the
  # SDK/API version -- confirm against the describe() output in your workspace.
  state = str(index_status.get("detailed_state") or index_status.get("status") or "UNKNOWN")
  if state.startswith("ONLINE"):
    break
  if "FAIL" in state.upper():
    raise RuntimeError(f"Index entered a failed state: {state}")
  if time.monotonic() >= deadline:
    raise TimeoutError(
      f"Index was not ONLINE after {timeout_seconds} seconds (last state: {state})"
    )
  print("Waiting for index to be ONLINE...")
  time.sleep(poll_interval_seconds)
print("Index is ONLINE")
index.describe()

## Similarity search

Query the Vector Index to find similar documents!

In [0]:
# Column names of the source table, as a list: [col1, col2, ...].
# Any subset of these columns may be requested instead.
all_columns = spark.table(source_table_fullname).columns

results = index.similarity_search(
    query_text="Greek myths",
    columns=all_columns,
    num_results=2,
)

results

In [0]:
# Same query as above, this time with a filter on the "id" column.
results = index.similarity_search(
    query_text="Greek myths",
    columns=all_columns,
    filters={"id NOT": ("13770", "88231")},
    num_results=2,
)

results


## Convert results to LangChain documents

The first column retrieved is loaded into `page_content`; the remaining columns, excluding the trailing score, are loaded into `metadata`.

In [0]:
from langchain.schema import Document

# The return annotation is a string so that it is not evaluated when the function is
# defined; `List` therefore does not need to be imported for this cell to run.
def convert_vector_search_to_documents(results) -> "List[Document]":
  """Convert a similarity-search response into LangChain ``Document`` objects.

  Args:
    results: Response dict from ``similarity_search``. Reads the column
      descriptors at ``results["manifest"]["columns"]`` (each with a ``"name"``
      key) and the rows at ``results["result"]["data_array"]``.

  Returns:
    One ``Document`` per row. The first value of each row becomes
    ``page_content``; the values between the first and the last are stored in
    ``metadata`` under their column names. The last value of each row (the
    score) is not included. Returns an empty list when there are no rows.
  """
  column_names = [column["name"] for column in results["manifest"]["columns"]]

  langchain_docs = []
  # A response without rows may omit "data_array"; treat that as zero rows.
  for row in results["result"].get("data_array") or []:
      # row[0] is the page content and row[-1] is the score; everything in
      # between is paired with the matching column name.
      metadata = dict(zip(column_names[1:], row[1:-1]))
      langchain_docs.append(Document(page_content=row[0], metadata=metadata))
  return langchain_docs

# Convert the most recent search results and display the resulting documents.
langchain_docs = convert_vector_search_to_documents(results)

langchain_docs

## Delete vector index

In [0]:
# Clean up: delete the vector index addressed by vs_index_fullname.
vsc.delete_index(index_name=vs_index_fullname)