
# Vector Search Indexの作成

In [0]:
%pip install -qq databricks-vectorsearch
# ^ Installs the `databricks-vectorsearch` package; `-qq` keeps pip output quiet.
# NOTE(review): the package version is not pinned here; consider pinning it
# so that re-runs of this notebook resolve the same client version.

# Restart the Python process; presumably so the package installed above is
# importable in the following cells.
%restart_python

In [0]:
def _text_widget(name, default):
    """Register a text widget called `name` and return its current value."""
    dbutils.widgets.text(name, default)
    return dbutils.widgets.get(name)


# Notebook parameters. Each call registers the widget with its default and then
# reads the value back, in the same order the widgets are declared.
vs_endpoint = _text_widget("vs_endpoint", "default_vs_endpoint")
catalog = _text_widget("catalog", "workspace")
schema = _text_widget("schema", "")
source_table = _text_widget("source_table", "")
index_name = _text_widget("index_name", "")

# Echo every widget value so the parameters used for this run are visible.
print(dbutils.widgets.getAll())


## Vector Search Endpointの作成

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Create the Vector Search endpoint named by the `vs_endpoint` widget.
# Client construction sits outside the `try` so that a failure to build the
# client is reported directly instead of being inspected as an endpoint error.
client = VectorSearchClient()
try:
    client.create_endpoint_and_wait(
        name=vs_endpoint,
        endpoint_type="STANDARD",
        verbose=True,
    )
except Exception as e:
    # An endpoint that already exists is not a failure for this notebook: it is
    # simply reused. Any other error is logged and re-raised so that later
    # cells do not run against a missing or broken endpoint.
    if "already exists" not in str(e):
        print(e)
        raise
    print(f"Endpoint {vs_endpoint} already exists; reusing it.")


## Vector Search Indexの作成

In [0]:

import time

def exists_index(client, index_name, endpoint_name=None):
    """Return True if an index named `index_name` is listed on the endpoint.

    Args:
        client: Object exposing `list_indexes(endpoint_name)` that returns a
            mapping; the indexes are read from its "vector_indexes" key.
        index_name: Name to compare against each listed index's "name" field.
        endpoint_name: Endpoint to query. When omitted, the notebook-level
            `vs_endpoint` variable is used, so existing two-argument callers
            keep working unchanged.

    Returns:
        bool: True when a listed index has a matching name; False when there is
        no match or the "vector_indexes" entry is missing or empty.
    """
    # Only touch the notebook global when the caller did not pass an endpoint.
    endpoint = vs_endpoint if endpoint_name is None else endpoint_name
    vector_indexes = client.list_indexes(endpoint).get("vector_indexes") or []
    return any(index.get("name") == index_name for index in vector_indexes)

def wait_for_deletion(client, index_name, max_attempts=10, interval_sec=2):
    """Poll until `index_name` is no longer reported by `exists_index`.

    Args:
        client: Client object forwarded to `exists_index`.
        index_name: Name of the index whose deletion is awaited.
        max_attempts: Maximum number of existence checks (default 10).
        interval_sec: Seconds to sleep between consecutive checks (default 2).

    Returns:
        None, as soon as a check reports that the index no longer exists.

    Raises:
        TimeoutError: If the index is still present after `max_attempts`
            checks. `TimeoutError` is a subclass of `Exception`, so callers
            that catch `Exception` are unaffected.
    """
    for attempt in range(max_attempts):
        if not exists_index(client, index_name):
            return
        # Sleep only when another check will follow; sleeping after the final
        # check would just delay the timeout error.
        if attempt < max_attempts - 1:
            time.sleep(interval_sec)
    raise TimeoutError("Timeout waiting for index deletion")

# (Re)create the Delta Sync vector index over the source table.

# Fail fast on unset widget values: an empty part would yield a malformed
# three-level name such as "workspace..".
_missing_params = [
    param_name
    for param_name, param_value in (
        ("catalog", catalog),
        ("schema", schema),
        ("source_table", source_table),
        ("index_name", index_name),
    )
    if not param_value
]
if _missing_params:
    raise ValueError(
        f"Required widget value(s) not set: {', '.join(_missing_params)}"
    )

# Build the fully qualified names once instead of re-formatting them per call.
full_index_name = f"{catalog}.{schema}.{index_name}"
full_source_table_name = f"{catalog}.{schema}.{source_table}"

try:
    client = VectorSearchClient()

    # If the index already exists, delete it and wait until it is gone so the
    # create call below starts from a clean state.
    if exists_index(client, full_index_name):
        print(f"index {full_index_name} is already existing.")
        print(f"Delete current index: {full_index_name}")
        client.delete_index(
            index_name=full_index_name,
            endpoint_name=vs_endpoint,
        )
        wait_for_deletion(client, full_index_name)
        print(f"Deleted index {full_index_name}")

    # Create the index and block until the SDK call returns.
    index = client.create_delta_sync_index_and_wait(
        endpoint_name=vs_endpoint,
        source_table_name=full_source_table_name,
        index_name=full_index_name,
        pipeline_type="TRIGGERED",
        primary_key="id",
        embedding_model_endpoint_name="databricks-bge-large-en",
        embedding_source_column="en_summarized_body",
        columns_to_sync=[
            "id",
            "body_head_8000",
            "created_at",
            "updated_at",
            "title",
            "url",
            "organization_url_name",
            "summarized_body",
            "en_summarized_body",
            "author_id",
            "author_name",
            "tag_names",
            "imported_at",
        ],
    )
except Exception as e:
    # Log the error for the notebook output, then re-raise so the cell (and any
    # job running this notebook) is marked as failed instead of silently
    # continuing without an index.
    print(e)
    raise