In [0]:
%pip install -U -qqqq mlflow langgraph==0.3.4 databricks-langchain databricks-agents uv  databricks-vectorsearch --upgrade langgraph
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
%run ./00-init-requirements

Catalog created andrea_tardif_v2
Schema created andrea_tardif_v2.workday_demos
Volume created /Volumes/andrea_tardif_v2/workday_demos/workday_unstructure_data


# Create Vector Search Indexes

Create the indexes via python SDK. There are two steps involved:

1. Create vector search endpoint (one endpoint can serve multiple vector search indexes)
2. Create vector search indexes for different data types:
   - Email communications
   - Meeting notes
   - Customer feedback
   - Employee records
   - Job requisitions

In [0]:
from pyspark.sql.functions import expr, col, explode, monotonically_increasing_id

# Parse PDF documents from volumes using ai_parse_document
customer_feedback_parsed = (
    spark.read.format("binaryFile")
    .load(f"/Volumes/{catalog_name}/{schema_name}/workday_unstructure_data/customer_feedback/")
    .withColumn("parsed", expr("ai_parse_document(content, map('version', '2.0'))"))
    .withColumn("content", expr("array_join(transform(parsed:document.elements::ARRAY<STRUCT<content:STRING>>, x -> x.content), '\n')"))
    .select(
        "content",
        expr("parsed:document").alias("document"),
        expr("parsed:document:pages").alias("pages"),
        expr("parsed:error_status").alias("error_status"),
        col("path").alias("doc_uri")
    )
)

meeting_notes_parsed = (
    spark.read.format("binaryFile")
    .load(f"/Volumes/{catalog_name}/{schema_name}/workday_unstructure_data/meeting_notes/")
    .withColumn("parsed", expr("ai_parse_document(content, map('version', '2.0'))"))
    .withColumn("content", expr("array_join(transform(parsed:document.elements::ARRAY<STRUCT<content:STRING>>, x -> x.content), '\n')"))
    .select(
        "content",
        expr("parsed:document").alias("document"),
        expr("parsed:document:pages").alias("pages"),
        expr("parsed:error_status").alias("error_status"),
        col("path").alias("doc_uri")
    )
)

email_communications_parsed = (
    spark.read.format("binaryFile")
    .load(f"/Volumes/{catalog_name}/{schema_name}/workday_unstructure_data/email_communications/")
    .withColumn("parsed", expr("ai_parse_document(content, map('version', '2.0'))"))
    .withColumn("content", expr("array_join(transform(parsed:document.elements::ARRAY<STRUCT<content:STRING>>, x -> x.content), '\n')"))
    .select(
        "content",
        expr("parsed:document").alias("document"),
        expr("parsed:document:pages").alias("pages"),
        expr("parsed:error_status").alias("error_status"),
        col("path").alias("doc_uri")
    )
)

In [0]:
customer_feedback_parsed.createOrReplaceTempView("vf_customer_feedback")
meeting_notes_parsed.createOrReplaceTempView("vf_meeting_notes")
email_communications_parsed.createOrReplaceTempView("vf_email_communications")

def create_kb_table_from_parsed(view_name, kb_table_fqn):
    """Create knowledge base table from ai_parse_document output using SQL"""
    # Drop and recreate table
    spark.sql(f"DROP TABLE IF EXISTS {kb_table_fqn}")
    
    spark.sql(f"""
        CREATE TABLE {kb_table_fqn} (
            id BIGINT GENERATED ALWAYS AS IDENTITY,
            content STRING,
            doc_uri STRING
        ) TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """)
    
    # Insert data directly from temp view
    spark.sql(f"""
        INSERT INTO {kb_table_fqn} (content, doc_uri)
        SELECT
            content,
            doc_uri
        FROM {view_name}
        WHERE content IS NOT NULL
    """)
    
    record_count = spark.table(kb_table_fqn).count()
    print(f"‚úÖ {kb_table_fqn} created with {record_count} records")

# Create knowledge base tables from parsed documents
create_kb_table_from_parsed(
    "vf_customer_feedback",
    f"{catalog_name}.{schema_name}.customer_feedback_knowledge_base"
)

create_kb_table_from_parsed(
    "vf_meeting_notes",
    f"{catalog_name}.{schema_name}.meeting_notes_knowledge_base"
)

create_kb_table_from_parsed(
    "vf_email_communications",
    f"{catalog_name}.{schema_name}.email_communications_knowledge_base"
)

‚úÖ andrea_tardif_v2.workday_demos.customer_feedback_knowledge_base created with 25 records
‚úÖ andrea_tardif_v2.workday_demos.meeting_notes_knowledge_base created with 25 records
‚úÖ andrea_tardif_v2.workday_demos.email_communications_knowledge_base created with 25 records


In [0]:
vs_endpoint_name = f"sales-endpoint-{catalog_name}"

# Email communications index
email_vs_index_name = f"{catalog_name}.{schema_name}.email_communications_index"
email_vs_input_table = f"{catalog_name}.{schema_name}.email_communications_knowledge_base"

# Meeting notes index
notes_vs_index_name = f"{catalog_name}.{schema_name}.meeting_notes_index"
notes_vs_input_table = f"{catalog_name}.{schema_name}.meeting_notes_knowledge_base"

# Customer feedback index
feedback_vs_index_name = f"{catalog_name}.{schema_name}.customer_feedback_index"
feedback_vs_input_table = f"{catalog_name}.{schema_name}.customer_feedback_knowledge_base"

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Create vector search endpoint
client = VectorSearchClient(disable_notice=True)

try:
    client.delete_endpoint(vs_endpoint_name)
    print(f"‚ÑπÔ∏è  Vector search endpoint '{vs_endpoint_name}' deleted")
    
except Exception as e:
    print(f"‚ÑπÔ∏è  Vector search endpoint '{vs_endpoint_name}' did not exist or could not be deleted")

    client.create_endpoint(
        name=vs_endpoint_name,
        endpoint_type="STANDARD"
    )
    print(f"‚úÖ Vector search endpoint '{vs_endpoint_name}' created successfully")

‚ÑπÔ∏è  Vector search endpoint 'sales-endpoint-andrea_tardif_v2' did not exist or could not be deleted
‚úÖ Vector search endpoint 'sales-endpoint-andrea_tardif_v2' created successfully


In [0]:
import time

def create_vs_index(endpoint_name, source_table, index_name):
    """Create a vector search index with error handling"""
    try:
        index = client.create_delta_sync_index(
            endpoint_name=endpoint_name,
            source_table_name=source_table,
            index_name=index_name,
            pipeline_type="TRIGGERED",
            primary_key="id",
            embedding_source_column="content",
            embedding_model_endpoint_name="databricks-bge-large-en"
        )
        print(f"‚úÖ {index_name} created successfully")
        return index
    
    except Exception as e:
        if "already exists" in str(e).lower():
            print(f"‚ÑπÔ∏è  {index_name} already exists")

        else:
            print(f"‚ùå Error creating {index_name}: {str(e)}")
            return None

# Create all three indexes
email_index = create_vs_index(
    vs_endpoint_name, 
    email_vs_input_table, 
    email_vs_index_name,
)

notes_index = create_vs_index(
    vs_endpoint_name, 
    notes_vs_input_table, 
    notes_vs_index_name,
)

feedback_index = create_vs_index(
    vs_endpoint_name, 
    feedback_vs_input_table, 
    feedback_vs_index_name,
)

‚úÖ andrea_tardif_v2.workday_demos.email_communications_index created successfully
‚úÖ andrea_tardif_v2.workday_demos.meeting_notes_index created successfully
‚úÖ andrea_tardif_v2.workday_demos.customer_feedback_index created successfully


In [0]:
print("Syncing vector search indexes...")

for index_name in [
    (email_vs_index_name),
    (notes_vs_index_name),
    (feedback_vs_index_name)
]:
    try:
        client.get_index(endpoint_name=vs_endpoint_name, index_name=index_name).sync()
        print(f"‚úÖ {index_name} index synced")
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not sync {index_name} index: {str(e)}")

print("\nüéâ Vector search setup complete!")

Syncing vector search indexes...
‚ö†Ô∏è  Could not sync andrea_tardif_v2.workday_demos.email_communications_index index: Response content b'{"error_code":"BAD_REQUEST","message":"Vector search endpoint sales-endpoint-andrea_tardif_v2 is not ready yet.","details":[{"@type":"type.googleapis.com/google.rpc.RequestInfo","request_id":"0f5173aa-fd15-4e6e-93f0-0f846ed4ee06","serving_data":""}]}', status_code 400
‚ö†Ô∏è  Could not sync andrea_tardif_v2.workday_demos.meeting_notes_index index: Response content b'{"error_code":"BAD_REQUEST","message":"Vector search endpoint sales-endpoint-andrea_tardif_v2 is not ready yet.","details":[{"@type":"type.googleapis.com/google.rpc.RequestInfo","request_id":"9013cc41-d58b-4658-82df-10d7a1f2f239","serving_data":""}]}', status_code 400
‚ö†Ô∏è  Could not sync andrea_tardif_v2.workday_demos.customer_feedback_index index: Response content b'{"error_code":"BAD_REQUEST","message":"Vector search endpoint sales-endpoint-andrea_tardif_v2 is not ready yet.","deta