In [0]:
%pip install -U -qqqq mlflow langgraph==0.3.4 databricks-langchain databricks-agents uv  databricks-vectorsearch --upgrade langgraph
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
%restart_python

In [0]:
%run ./00-init-requirements

# Create Vector Search Indexes for Workday CRM Data

Create the indexes via python SDK. There are two steps involved:

1. Create vector search endpoint (one endpoint can serve multiple vector search indexes)
2. Create vector search indexes for different data types:
   - Email communications
   - Meeting notes
   - Customer feedback
   - Employee records
   - Job requisitions

In [0]:
from pyspark.sql.functions import expr, col, explode, monotonically_increasing_id

# Parse PDF documents from volumes using ai_parse_document
customer_feedback_parsed = (
    spark.read.format("binaryFile")
    .load(f"/Volumes/{catalog_name}/{schema_name}/workday_unstructure_data/customer_feedback/")
    .withColumn("parsed", expr("ai_parse_document(content)"))
    .select(
        "path",
        expr("parsed:document:pages").alias("pages"),
        expr("parsed:error_status").alias("error_status")
    )
)

meeting_notes_parsed = (
    spark.read.format("binaryFile")
    .load(f"/Volumes/{catalog_name}/{schema_name}/workday_unstructure_data/meeting_notes/")
    .withColumn("parsed", expr("ai_parse_document(content)"))
    .select(
        "path",
        expr("parsed:document:pages").alias("pages"),
        expr("parsed:error_status").alias("error_status")
    )
)

email_communications_parsed = (
    spark.read.format("binaryFile")
    .load(f"/Volumes/{catalog_name}/{schema_name}/workday_unstructure_data/email_communications/")
    .withColumn("parsed", expr("ai_parse_document(content)"))
    .select(
        "path",
        expr("parsed:document:pages").alias("pages"),
        expr("parsed:error_status").alias("error_status")
    )
)

In [0]:
customer_feedback_parsed.createOrReplaceTempView("vf_customer_feedback")
meeting_notes_parsed.createOrReplaceTempView("vf_meeting_notes")
email_communications_parsed.createOrReplaceTempView("vf_email_communications")

def create_kb_table_from_parsed(view_name, kb_table_fqn):
    """Create knowledge base table from ai_parse_document output using SQL"""
    # Drop and recreate table
    spark.sql(f"DROP TABLE IF EXISTS {kb_table_fqn}")
    
    spark.sql(f"""
        CREATE TABLE {kb_table_fqn} (
            id BIGINT GENERATED ALWAYS AS IDENTITY,
            content STRING,
            doc_uri STRING
        ) TBLPROPERTIES (delta.enableChangeDataFeed = true)
    """)
    
    # Insert data directly from temp view
    spark.sql(f"""
        INSERT INTO {kb_table_fqn} (content, doc_uri)
        SELECT
            page.content,
            path AS doc_uri
        FROM {view_name}
        LATERAL VIEW EXPLODE(
            FROM_JSON(TO_JSON(pages), 'array<struct<content:string>>')
        ) AS page
        WHERE page.content IS NOT NULL
    """)
    
    record_count = spark.table(kb_table_fqn).count()
    print(f"✅ {kb_table_fqn} created with {record_count} records")

# Create knowledge base tables from parsed documents
create_kb_table_from_parsed(
    "vf_customer_feedback",
    f"{catalog_name}.{schema_name}.customer_feedback_knowledge_base"
)

create_kb_table_from_parsed(
    "vf_meeting_notes",
    f"{catalog_name}.{schema_name}.meeting_notes_knowledge_base"
)

create_kb_table_from_parsed(
    "vf_email_communications",
    f"{catalog_name}.{schema_name}.email_communications_knowledge_base"
)

✅ smriti_sridhar.workday_demos.customer_feedback_knowledge_base created with 0 records
✅ smriti_sridhar.workday_demos.meeting_notes_knowledge_base created with 0 records
✅ smriti_sridhar.workday_demos.email_communications_knowledge_base created with 0 records


In [0]:
vs_endpoint_name = f"workday-sales-endpoint-{user_name}"

# Email communications index
email_vs_index_name = f"{catalog_name}.{schema_name}.email_communications_index"
email_vs_input_table = f"{catalog_name}.{schema_name}.email_communications_knowledge_base"

# Meeting notes index
notes_vs_index_name = f"{catalog_name}.{schema_name}.meeting_notes_index"
notes_vs_input_table = f"{catalog_name}.{schema_name}.meeting_notes_knowledge_base"

# Customer feedback index
feedback_vs_index_name = f"{catalog_name}.{schema_name}.customer_feedback_index"
feedback_vs_input_table = f"{catalog_name}.{schema_name}.customer_feedback_knowledge_base"

In [0]:
from databricks.vector_search.client import VectorSearchClient

# Create vector search endpoint
client = VectorSearchClient(disable_notice=True)

try:
    client.create_endpoint(
        name=vs_endpoint_name,
        endpoint_type="STANDARD"
    )
    print(f"✅ Vector search endpoint '{vs_endpoint_name}' created successfully")
except Exception as e:
    print(f"ℹ️  Vector search endpoint '{vs_endpoint_name}' already exists")

✅ Vector search endpoint 'workday-sales-endpoint-smriti_sridhar' created successfully


In [0]:
import time

def create_vs_index(endpoint_name, source_table, index_name):
    """Create a vector search index with error handling"""
    try:
        index = client.create_delta_sync_index(
            endpoint_name=endpoint_name,
            source_table_name=source_table,
            index_name=index_name,
            pipeline_type="TRIGGERED",
            primary_key="id",
            embedding_source_column="content",
            embedding_model_endpoint_name="databricks-bge-large-en"
        )
        print(f"✅ {index_name} created successfully")
        return index
    except Exception as e:
        if "already exists" in str(e).lower():
            print(f"ℹ️  {index_name} already exists")
            # return client.get_index(endpoint_name=endpoint_name, index_name=index_name)
        else:
            print(f"❌ Error creating {index_name}: {str(e)}")
            return None

# Create all three indexes
email_index = create_vs_index(
    vs_endpoint_name, 
    email_vs_input_table, 
    email_vs_index_name,
)

notes_index = create_vs_index(
    vs_endpoint_name, 
    notes_vs_input_table, 
    notes_vs_index_name,
)

feedback_index = create_vs_index(
    vs_endpoint_name, 
    feedback_vs_input_table, 
    feedback_vs_index_name,
)

✅ smriti_sridhar.workday_demos.email_communications_index created successfully
✅ smriti_sridhar.workday_demos.meeting_notes_index created successfully
✅ smriti_sridhar.workday_demos.customer_feedback_index created successfully


In [0]:
print("Syncing vector search indexes...")

for index_name in [
    (email_vs_index_name),
    (notes_vs_index_name),
    (feedback_vs_index_name)
]:
    try:
        client.get_index(endpoint_name=vs_endpoint_name, index_name=index_name).sync()
        print(f"✅ {index_name} index synced")
    except Exception as e:
        print(f"⚠️  Could not sync {index_name} index: {str(e)}")

print("\n🎉 Vector search setup complete!")

Syncing vector search indexes...
✅ smriti_sridhar.workday_demos.email_communications_index index synced
✅ smriti_sridhar.workday_demos.meeting_notes_index index synced
✅ smriti_sridhar.workday_demos.customer_feedback_index index synced

🎉 Vector search setup complete!
