- https://api-docs.databricks.com/python/vector-search/databricks.vector_search.html

In [0]:
%pip install --force-reinstall databricks_vectorsearch 
%pip install --force-reinstall -v langchain openai
%pip install -U -qqqq databricks-sdk mlflow mlflow-skinny
# The %pip lines above (re)install the Vector Search client, LangChain and OpenAI,
# and upgrade the Databricks SDK and MLflow packages.
# NOTE(review): no versions are pinned, so the installed versions can differ between runs.
# Restart the Python process so the cells below import the freshly installed packages.
dbutils.library.restartPython()

In [0]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.vectorsearch import EndpointType
# Databricks SDK workspace client, used below for the endpoint listing/creation calls.
# Constructed without arguments — presumably it picks up the notebook's credentials; TODO confirm.
w = WorkspaceClient()

In [0]:
from databricks.vector_search.client import VectorSearchClient
# Vector Search client, used below for the endpoint and index calls.
vsc = VectorSearchClient()

# Endpoints

In [0]:
# Name of the Vector Search endpoint that every cell below operates on.
# Pick a name that already exists: the `get_endpoint` lookup runs before any creation cell,
# and the `create_endpoint` cell further down tolerates only an "already exists" error.
# NOTE(review): presumably that is the exception the original note referred to — confirm.
VECTOR_SEARCH_ENDPOINT = "one-env-shared-endpoint-16"

In [0]:
# List every Vector Search endpoint in the workspace and print its name.
# (Alternative via the Vector Search client: vsc.list_endpoints())
#
# The SDK listing call yields its results lazily, so it is materialised into a
# list here: a later cell scans `vector_search_endpoints` a second time, and a
# one-shot iterator would already be empty by then.
vector_search_endpoints = list(w.vector_search_endpoints.list_endpoints())
for ve in vector_search_endpoints:
    print(ve.name)

In [0]:
# Look up the endpoint named by VECTOR_SEARCH_ENDPOINT.
# NOTE(review): this runs before the creation cells, so it presumably fails when
# the endpoint does not exist yet — confirm.
endpoint = vsc.get_endpoint(name=VECTOR_SEARCH_ENDPOINT)
# Bare expression so the notebook renders the lookup result.
endpoint

In [0]:
# Try to create the endpoint; an "already exists" failure is expected and tolerated,
# any other failure is reported and propagated.
try:
    vsc.create_endpoint(name=VECTOR_SEARCH_ENDPOINT, endpoint_type="STANDARD")
except Exception as create_error:
    # Always show what the service answered, even for the tolerated case.
    print(create_error)
    if "already exists" not in str(create_error):
        raise create_error

In [0]:
# Create the endpoint only when no endpoint with that name exists, then wait for it to be online.
#
# The names are fetched with a fresh listing call rather than by re-reading
# `vector_search_endpoints`: the SDK listing yields results lazily, and that object
# may already have been consumed by the earlier print loop, in which case a second
# pass would find nothing and wrongly trigger a create.
existing_endpoint_names = {ve.name for ve in w.vector_search_endpoints.list_endpoints()}
if VECTOR_SEARCH_ENDPOINT not in existing_endpoint_names:
    print(f"Please wait, creating Vector Search endpoint `{VECTOR_SEARCH_ENDPOINT}`.  This can take up to 20 minutes...")
    w.vector_search_endpoints.create_endpoint_and_wait(VECTOR_SEARCH_ENDPOINT, endpoint_type=EndpointType.STANDARD)

# Make sure vector search endpoint is online and ready.
w.vector_search_endpoints.wait_get_endpoint_vector_search_endpoint_online(VECTOR_SEARCH_ENDPOINT)

print(f"PASS: Vector Search endpoint `{VECTOR_SEARCH_ENDPOINT}` exists")

# Indexes

In [0]:
# List the indexes on the chosen endpoint; the bare call lets the notebook render the result.
vsc.list_indexes(VECTOR_SEARCH_ENDPOINT)

In [0]:
%sql
-- HACK - should be done at data curation/ingestion, etc.
-- Adds one text column combining product name, review header and review text;
-- the index cell below uses it as the embedding source column.
-- NOTE(review): ADD COLUMN fails when the column already exists, so this cell is not re-runnable as is.
ALTER TABLE juan_dev.genai.amazon_reviews_mktpl ADD COLUMN embedding_source_joined STRING;

-- CONCAT_WS skips NULL inputs, whereas CONCAT returns NULL as soon as any input is NULL.
-- A review with a missing header or text therefore still gets embedding text
-- built from the fields it does have; rows with all three fields are unchanged.
UPDATE juan_dev.genai.amazon_reviews_mktpl
SET embedding_source_joined = CONCAT_WS(' ', PRODUCT_NAME, REVIEW_HEADER, REVIEW_TEXT);

In [0]:
# Delta Sync index with managed embeddings over the Amazon reviews table.
# The source table comes from the Amazon Reviews notebook: dbrx-de/data/amazon-reviews

# Fully qualified names of the source table and of the index built from it.
SOURCE_TABLE_FULLNAME = "juan_dev.genai.amazon_reviews_mktpl"
VS_INDEX_FULLNAME = "juan_dev.genai.amazon_reviews_mktpl_vsidx"
# Key column of the source table and the text column passed as the embedding source.
PRIMARY_KEY = "REVIEW_ID"
EMBEDDING_SOURCE_COLUMN = "embedding_source_joined"

# Arguments for the index-creation call, gathered in one place.
# `columns_to_sync` is not passed.
delta_sync_index_args = {
    "endpoint_name": VECTOR_SEARCH_ENDPOINT,
    "index_name": VS_INDEX_FULLNAME,
    "source_table_name": SOURCE_TABLE_FULLNAME,
    # TRIGGERED: syncs are started explicitly (see the sync section below).
    "pipeline_type": "TRIGGERED",
    "primary_key": PRIMARY_KEY,
    "embedding_source_column": EMBEDDING_SOURCE_COLUMN,
    "embedding_model_endpoint_name": "databricks-bge-large-en",
}
i = vsc.create_delta_sync_index(**delta_sync_index_args)

## Sync the Vector Search Index
Because we specified `pipeline_type="TRIGGERED"` when configuring the Delta Index, we still need to manually tell the index to sync with the delta table. This will take a few minutes.

This will not work if the index is not ready yet. We use the `wait_until_ready` method to block until the index is ready before calling `sync()`.

In [0]:
# Sync
# Fetch a handle to the index by endpoint and index name; the next cell
# calls `wait_until_ready` and `sync` on it.
index = vsc.get_index(endpoint_name=VECTOR_SEARCH_ENDPOINT,index_name=VS_INDEX_FULLNAME)
print(index)

In [0]:
# Wait for the index to become ready before syncing; per the note above, a sync
# does not work on an index that is not ready yet.
# verbose=True presumably prints progress while waiting — TODO confirm.
index.wait_until_ready(verbose=True)
# Manually trigger the sync, needed because the index was created with pipeline_type="TRIGGERED".
index.sync()

## Delete vector index

In [0]:
# VS_INDEX_FULLNAME = "juan_dev.genai.amazon_reviews_mktpl_vsidx"
# vsc.delete_index(endpoint_name=VECTOR_SEARCH_ENDPOINT, index_name=VS_INDEX_FULLNAME)