Testing storage-optimized vector search indexes.

In [0]:
%pip install databricks-vectorsearch
# Restart the Python process so the package installed above can be imported
# by the following cells.
%restart_python

In [0]:
# Resolve the notebook context once, then read the API token and the API URL
# of the current notebook session from it.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
DATABRICKS_TOKEN = notebook_context.apiToken().get()
workspaceUrl = notebook_context.apiUrl().get()

In [0]:
# Notebook configuration: endpoints, source table and column names used by the
# cells below.
vector_search_endpoint = "test_storage_optim"
embedding_model_endpoint = "databricks-gte-large-en"

source_table = "lucasbruand_catalog.ai_agent.knowledge_base"
id_col, content_col = "id", "content"

# Name of the derived table that also stores a precomputed embedding column.
source_table_with_embeddings = source_table + "_emb"

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Client used to create indexes on the vector search endpoint.
vsc = VectorSearchClient()

# Delta Sync index over the raw source table: the text column is passed as the
# embedding source together with the embedding model endpoint, so no
# precomputed vector column is supplied here.
managed_index_name = f"{source_table}_index"
managed_index_options = dict(
    endpoint_name=vector_search_endpoint,
    source_table_name=source_table,
    index_name=managed_index_name,
    pipeline_type="TRIGGERED",
    primary_key=id_col,
    embedding_source_column=content_col,
    embedding_model_endpoint_name=embedding_model_endpoint,
)
index = vsc.create_delta_sync_index(**managed_index_options)

print(f"Vector search index created: {managed_index_name}")
print(f"Endpoint: {vector_search_endpoint}")

In [0]:
# Materialize a copy of the source table with an extra `embedding` column
# produced by calling the embedding model endpoint through ai_query().
# The text column is referenced through `content_col` both in the projection
# and as the ai_query() input, so the embedded text always matches the
# configured content column (previously ai_query() hard-coded `content`).
display(spark.sql(f"""
CREATE OR REPLACE TABLE {source_table_with_embeddings} AS
SELECT
  {id_col},
  product_name,
  title,
  {content_col},
  doc_uri,
  ai_query(
    '{embedding_model_endpoint}',
    {content_col}
  ) AS embedding
FROM {source_table}
"""))

In [0]:
# Show the contents of the table that carries the precomputed embeddings.
display(spark.table(source_table_with_embeddings))


In [0]:
# Peek at a single stored embedding value.
display(
    spark.table(source_table_with_embeddings)
    .select("embedding")
    .limit(1)
)

In [0]:
# Compute the number of elements of one stored embedding; `result` is read
# back into Python by the cell that prints the embedding dimension.
result = (
    spark.table(source_table_with_embeddings)
    .selectExpr("size(embedding) as embedding_size")
    .limit(1)
)

display(result)

In [0]:
# Read the embedding dimension back from the one-row `result` DataFrame.
# first() returns None when the table is empty, which lets us fail with a
# clear message instead of an IndexError from collect()[0].
first_row = result.first()
if first_row is None:
    raise ValueError(
        f"{source_table_with_embeddings} has no rows; cannot infer the embedding dimension"
    )

embedding_size = first_row[0]
# size() yields NULL or -1 for a NULL array, so reject anything that is not a
# positive length before it is used as the index's embedding dimension.
if embedding_size is None or embedding_size <= 0:
    raise ValueError(
        f"Invalid embedding size {embedding_size!r} read from {source_table_with_embeddings}"
    )

print(f"Embedding dimension: {embedding_size}")

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Fresh client instance for this cell.
vsc = VectorSearchClient()

# Delta Sync index over the table with precomputed embeddings: instead of an
# embedding model endpoint, the existing vector column and its dimension are
# passed to the index.
precomputed_index_name = f"{source_table_with_embeddings}_index"
precomputed_index_options = {
    "endpoint_name": vector_search_endpoint,
    "source_table_name": source_table_with_embeddings,
    "index_name": precomputed_index_name,
    "pipeline_type": "TRIGGERED",
    "primary_key": id_col,
    "embedding_vector_column": "embedding",
    "embedding_dimension": embedding_size,
}
index_precomputed = vsc.create_delta_sync_index(**precomputed_index_options)

print(f"Vector search index created: {precomputed_index_name}")
print(f"Endpoint: {vector_search_endpoint}")
print("Using precomputed embeddings from column: embedding")