## Databricks Mosaic AI Vector Index Creation

![vector_index](./Assets/vector_index.png)

### Installing Utilities and Libraries

In [None]:
%pip install databricks-vectorsearch==0.63

### Restarting the Python Kernel

In [None]:
# Restart the Python process so the package installed by the %pip cell above
# can be imported by the cells that follow.
dbutils.library.restartPython()

### Enabling Change Data Feed (CDF) on the RAG Table in Unity Catalog

In [None]:
# Turn on the Delta change data feed for the existing RAG source table.
# The statement is kept in a named variable so it can be inspected or logged
# before it is executed.
enable_cdf_statement = """
ALTER TABLE RAG.final_rag_dataset
SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""
spark.sql(enable_cdf_statement)
     

### Creating Vector Index and Endpoint

In [None]:
from databricks.vector_search.client import VectorSearchClient

# Single place for the endpoint name so that endpoint creation and index
# creation cannot drift apart.
VECTOR_ENDPOINT_NAME = "vector_search_endpoint"

vector_client = VectorSearchClient()

# Provision the Vector Search endpoint that will host the index.
# Skip this call if the endpoint already exists; it is only needed for the
# first run or after the endpoint has been deleted.
vector_client.create_endpoint(
    name=VECTOR_ENDPOINT_NAME,
    endpoint_type="STANDARD",
)

# Create a Delta Sync index over the source table. Replace the
# YOUR_UC_NAME.SCHEMA.* placeholders with the real Unity Catalog names.
# The "chunk" text column is embedded with the named model serving endpoint,
# and "id" identifies each row.
index = vector_client.create_delta_sync_index(
    endpoint_name=VECTOR_ENDPOINT_NAME,
    source_table_name="YOUR_UC_NAME.SCHEMA.TABLE_NAME",
    index_name="YOUR_UC_NAME.SCHEMA.rag_vector_index",
    pipeline_type="TRIGGERED",
    primary_key="id",
    embedding_source_column="chunk",
    embedding_model_endpoint_name="databricks-gte-large-en",
)

### Querying the Index - Information Retrieval

#### Dense Vector Similarity Search

In [None]:
import json

# Free-text question to look up in the vector index.
user_question = "CarbonOps ESG Intelligence Model"

# No query_type is passed, so the SDK's default search mode is used.
results_dict = index.similarity_search(
    query_text=user_question,
    columns=["content_path", "chunk"],
    num_results=10,
)

# Print every returned row as pretty-printed JSON, keeping non-ASCII
# characters readable.
matched_rows = results_dict["result"]["data_array"]
for row in matched_rows:
    print(json.dumps(row, indent=2, ensure_ascii=False))

#### Hybrid Search

In [None]:
# Free-text question to look up in the vector index.
user_question = "CarbonOps ESG Intelligence Model"

# Hybrid search request (query_type="hybrid").
# query_text must be the variable itself: a quoted "{user_question}" without
# an f prefix would be sent verbatim as the literal placeholder text.
results_dict = index.similarity_search(
    query_text=user_question,
    columns=["content_path", "chunk"],
    num_results=10,
    query_type="hybrid",
)

# Print every returned row as pretty-printed JSON, keeping non-ASCII
# characters readable.
for content in results_dict["result"]["data_array"]:
    print(json.dumps(content, indent=2, ensure_ascii=False))

#### ANN (Approximate Nearest Neighbour) Search

In [None]:
# Free-text question to look up in the vector index.
user_question = "CarbonOps ESG Intelligence Model"

# Approximate-nearest-neighbour search request (query_type="ANN").
# query_text must be the variable itself: a quoted "{user_question}" without
# an f prefix would be sent verbatim as the literal placeholder text.
results_dict = index.similarity_search(
    query_text=user_question,
    columns=["content_path", "chunk"],
    num_results=10,
    query_type="ANN",
)

# Print every returned row as pretty-printed JSON, keeping non-ASCII
# characters readable.
for content in results_dict["result"]["data_array"]:
    print(json.dumps(content, indent=2, ensure_ascii=False))

### Searching with Semantic Reranker

In [None]:
from databricks.vector_search.reranker import DatabricksReranker

# Free-text question to look up in the vector index.
user_question = "carbonops Sustainable Development Goals Mappings"

# Hybrid search whose results are passed through the Databricks reranker,
# configured to rerank on the "chunk" and "content_path" columns.
# query_text must be the variable itself: a quoted "{user_question}" without
# an f prefix would be sent verbatim as the literal placeholder text, so the
# retrieval and reranking would run against the wrong query.
results_dict = index.similarity_search(
    query_text=user_question,
    columns=["content_path", "chunk"],
    num_results=10,
    query_type="hybrid",
    reranker=DatabricksReranker(columns_to_rerank=["chunk", "content_path"]),
)

# Print every returned row as pretty-printed JSON, keeping non-ASCII
# characters readable.
for content in results_dict["result"]["data_array"]:
    print(json.dumps(content, indent=2, ensure_ascii=False))