# Generate Vector Search Index

In [None]:
# Local IDE only: open a serverless Databricks Connect session and bind it
# to `spark`, the session object the cells below use for SQL.
from databricks.connect import DatabricksSession

spark = (
    DatabricksSession.builder
    .remote(serverless=True)
    .getOrCreate()
)

In [None]:
# Notebook configuration: edit every value marked TODO before running.
# CATALOG, SCHEMA and MERCHANT_TABLE are joined below into the fully
# qualified source table name; CATALOG, SCHEMA and VS_INDEX into the
# fully qualified index name.
CATALOG = "users"  # TODO: your catalog name
SCHEMA = "david_huang"  # TODO: your schema name
MERCHANT_TABLE = "ner_demo_merchant_entities"  # TODO: your merchant entities table
VS_ENDPOINT = "one-env-shared-endpoint-0"  # TODO: your VS endpoint
VS_INDEX = "ner_demo_merchant_index"  # TODO: your VS index name

In [None]:
# Fail fast if the configuration cell was left blank.
assert CATALOG and SCHEMA, "Please set CATALOG and SCHEMA"

# Make the configured catalog and schema the session defaults
# (catalog first, then schema).
for scope_keyword, scope_name in (("CATALOG", CATALOG), ("SCHEMA", SCHEMA)):
    spark.sql(f"USE {scope_keyword} {scope_name}")

# Show the first five rows of the merchant entities table as a sanity check.
preview_query = f"SELECT * FROM {CATALOG}.{SCHEMA}.{MERCHANT_TABLE} LIMIT 5"
display(spark.sql(preview_query))

## Create Vector Search index (delta sync)

In [None]:
# Turn on Change Data Feed (CDF) for the source Delta table, then read the
# property back to confirm it was applied.
table_fqn = f"{CATALOG}.{SCHEMA}.{MERCHANT_TABLE}"

enable_cdf_sql = (
    f"ALTER TABLE {table_fqn} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)"
)
spark.sql(enable_cdf_sql)
print(f"Enabled CDF on {table_fqn}")

# Read back just the CDF property for verification.
cdf_property = spark.sql(
    f"SHOW TBLPROPERTIES {table_fqn}('delta.enableChangeDataFeed')"
)
display(cdf_property)

In [None]:
# Build a Delta Sync Vector Search index over the merchant table, embedding
# the `merchant_name` column through a model serving endpoint. Re-running
# this cell is safe: an "already exists" error is reported, not raised.
from databricks.vector_search.client import VectorSearchClient

assert VS_INDEX and VS_INDEX != "None", "Please set VS_INDEX to your desired index name"

# Fully qualified names of the source table and of the index to create.
source_table = f"{CATALOG}.{SCHEMA}.{MERCHANT_TABLE}"
index_name = f"{CATALOG}.{SCHEMA}.{VS_INDEX}"

# The personal access token is read from a secret scope and passed straight
# to the client so it is never bound to a notebook variable.
vsc = VectorSearchClient(
    personal_access_token=dbutils.secrets.get(scope="dhuang", key="databricks-token")
)

# Embedding model endpoint; pick one that is available in your workspace.
EMBEDDING_MODEL_ENDPOINT = "databricks-gte-large-en"

try:
    vsc.create_delta_sync_index(
        endpoint_name=VS_ENDPOINT,
        index_name=index_name,
        source_table_name=source_table,
        pipeline_type="TRIGGERED",  # manual syncs; change to CONTINUOUS if desired
        primary_key="entity_id",
        embedding_source_column="merchant_name",
        embedding_model_endpoint_name=EMBEDDING_MODEL_ENDPOINT,
    )
except Exception as err:
    # Treat "index already exists" as success; anything else propagates.
    error_text = str(err).lower()
    already_exists_markers = ("already exists", "resource_already_exists", "409")
    if not any(marker in error_text for marker in already_exists_markers):
        raise
    print(f"Index already exists: {index_name}")
else:
    print(f"Created Vector Search index: {index_name}")

In [None]:
# Look up the index on the configured endpoint and keep the handle in `idx`
# for the sync cell below; then show its current description.
idx = vsc.get_index(
    endpoint_name=VS_ENDPOINT,
    index_name=index_name,
)
display(idx.describe())

## Sync index

In [None]:
# Trigger a sync to ingest data and generate embeddings, then poll the index
# description until it reports an online/ready state, reports a failure, or
# the polling budget runs out.
import time

POLL_INTERVAL_SECONDS = 5
MAX_POLLS = 60  # 60 polls x 5 s = up to ~5 minutes

sync_info = idx.sync()
print("Triggered sync:", sync_info)

# Optional: poll for completion (simple loop)
for _ in range(MAX_POLLS):
    status = idx.describe()
    # Prefer the nested detailed state; fall back to the raw "status" value.
    state = status.get("status", {}).get("detailed_state") or status.get("status")
    print("Index state:", state)
    if isinstance(state, str):
        # Check for failure first so a failed state is never mistaken for
        # success (a failed state name may also contain "ONLINE") and so we
        # do not keep polling an index that will not recover on its own.
        if "FAILED" in state:
            print(f"Index reported a failed state ({state}); stopping polling.")
            break
        # NOTE(review): the "ONLINE" substring may also match states where an
        # update is still in progress -- confirm against the index status docs.
        if "ONLINE" in state or "READY" in state:
            break
    time.sleep(POLL_INTERVAL_SECONDS)
else:
    # The loop ended without a break: the polling budget ran out.
    print(
        f"Index not ready after ~{MAX_POLLS * POLL_INTERVAL_SECONDS} seconds; "
        "it may still be syncing."
    )

display(idx.describe())