In [0]:
%pip install --upgrade langchain databricks_langchain langchain-community pypdf databricks-vectorsearch --quiet

In [0]:
# Restart the Python process after the %pip cell so that the freshly
# installed/upgraded package versions are the ones imported by later cells.
# NOTE: this clears all Python state, so it must run before anything else is defined.
dbutils.library.restartPython()


## **Document Embedding & Semantic Search Workflow**

1. **Load Documents** - Import PDFs or text sources into your system.
2. **Split into Chunks** - Break content into manageable, overlapping pieces to maintain context.
3. **Generate Embeddings** - Convert chunks into vector representations using a machine learning model.
4. **Index for Search** - Store vectors in a searchable index for semantic queries.
5. **Query & Retrieve** - Use natural language questions to find relevant content efficiently.

**Why:** This gives us a searchable OPC UA knowledge base, easier troubleshooting through natural-language questions, and AI-assisted search. Setting up advanced configurations for OPC UA servers can be tricky, so being able to "chat" with one or more manuals is a real help.

In [0]:
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from databricks_langchain import DatabricksVectorSearch, DatabricksEmbeddings
from databricks.vector_search.client import VectorSearchClient
import pandas as pd
from pyspark.sql import SparkSession


# ---------------------------------------------------------------------------
# 1. Load the PDF manual and split it into overlapping chunks
# ---------------------------------------------------------------------------
pdf_path = "/Volumes/workspace/default/opc_ua_manual_siemens/OPCWCCUenUS_en-US.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# 1000-character chunks with a 100-character overlap, so text cut at a chunk
# boundary is repeated at the start of the following chunk.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)

# Fail early with a clear message instead of letting Spark fail later while
# trying to infer a schema from an empty DataFrame.
if not chunks:
    raise ValueError(f"No text chunks were extracted from '{pdf_path}'")

# ---------------------------------------------------------------------------
# 2. Persist the chunks as a Delta table (source table for the index)
# ---------------------------------------------------------------------------
# One row per chunk; "id" is the chunk's position and serves as the primary key.
data = [
    {
        "id": str(i),
        "text": chunk.page_content,
        "source": chunk.metadata.get("source", ""),
        "page": chunk.metadata.get("page", 0),
    }
    for i, chunk in enumerate(chunks)
]

df = spark.createDataFrame(pd.DataFrame(data))

table_name = "workspace.default.opcua_manual_chunks"
df.write.format("delta").mode("overwrite").saveAsTable(table_name)

print(f"Delta table '{table_name}' created with {len(chunks)} chunks")

# Enable Change Data Feed on the source table before creating the
# Delta Sync index that reads from it.
spark.sql(f"""
  ALTER TABLE {table_name}
  SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
""")

# ---------------------------------------------------------------------------
# 3. Create (or reuse) the Delta Sync vector index
# ---------------------------------------------------------------------------
client = VectorSearchClient(disable_notice=True)

endpoint_name = "opcua_manual_siemens_vector_search_endpoint"
index_name = "workspace.default.opcua_manual_index"

try:
    index = client.create_delta_sync_index(
        endpoint_name=endpoint_name,
        index_name=index_name,
        source_table_name=table_name,
        pipeline_type="TRIGGERED",  # or "CONTINUOUS" for auto-updates
        primary_key="id",
        embedding_source_column="text",
        embedding_model_endpoint_name="databricks-gte-large-en"
    )
    print(f"Delta Sync Index '{index_name}' created successfully")
except Exception as e:
    # Typical cause on a re-run: the index already exists. Reuse it and trigger
    # a sync so the freshly overwritten table is picked up (TRIGGERED pipelines
    # do not refresh on their own). If the index cannot be fetched either, the
    # error propagates (chained with the creation error) instead of being
    # silently swallowed and leaving `index` undefined.
    print(f"Error creating index: {e}")
    index = client.get_index(endpoint_name=endpoint_name, index_name=index_name)
    index.wait_until_ready()
    index.sync()
    print(f"Reusing existing index '{index_name}' and triggering a sync")

# Block until the index reports ready rather than sleeping for a fixed time,
# which may be too short (queries fail) or needlessly long.
print("Waiting for index to sync...")
index.wait_until_ready()

# ---------------------------------------------------------------------------
# 4. Wrap the index in a LangChain vector store for querying
# ---------------------------------------------------------------------------
vector_store = DatabricksVectorSearch(
    endpoint=endpoint_name,
    index_name=index_name
)

print("Vector store is ready for queries...")

In [0]:
# -------------------------------
# Test a similarity search
# -------------------------------

# Query with a natural-language question about the manual's content.
# (Passing the index name as the query text, as before, only searches for
# chunks that happen to resemble that identifier.)
query = "How do I configure security settings for the OPC UA server?"

results = vector_store.similarity_search(query, k=3)

if not results:
    print(f"No results returned for query: {query!r}")

# Show a short preview (first 200 characters) of each of the top-k chunks.
for i, doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content[:200])