# Imports

In [0]:
%pip install unitycatalog-ai[databricks] unitycatalog-langchain[databricks] databricks-langchain==0.3.0 pypdf==5.0.1 faiss-cpu==1.9.0

In [0]:
# Restart the Python process so the packages installed by the %pip cell above are the ones that get imported.
dbutils.library.restartPython()

# Get Document

Load the contents of [The Highway Code](https://www.highwaycodeuk.co.uk/uploads/3/2/9/2/3292309/the-official-highway-code-with-annexes-uk-en-12-04.pdf) and chunk it up.

In [0]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [0]:
# Fetch the PDF and materialise the loader's output as a list of Document objects
loader = PyPDFLoader("https://www.highwaycodeuk.co.uk/uploads/3/2/9/2/3292309/the-official-highway-code-with-annexes-uk-en-12-04.pdf")
documents = list(loader.load())

In [0]:
# Set up our text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,  # upper bound on chunk length, as measured by length_function
    chunk_overlap=400,  # amount of text shared between neighbouring chunks
    length_function=len,  # len() on a str counts characters
    is_separator_regex=False,  # treat separators as literal strings, not regex patterns
)

In [0]:
# Split the loaded pages into chunks using the splitter configured above.
# Later cells read both `page_content` and `metadata` from each resulting chunk.
split_docs = text_splitter.split_documents(documents)

# Create Vector Search

## Databricks Vector Search

In [0]:
from databricks.vector_search.client import VectorSearchClient

In [0]:
# Vector Search client used below to create the index.
# NOTE(review): constructed without arguments, so it presumably picks up the notebook's own Databricks credentials — confirm.
client = VectorSearchClient()

In [0]:
# Ensure our index source table exists.
# The DDL is a plain string (nothing is interpolated into it); the table is
# created with the Delta change data feed enabled.
create_source_table_sql = """
CREATE TABLE IF NOT EXISTS advancing_ai.datasets.highway_code (
    chunk_id STRING COMMENT 'document_id + chunk number' NOT NULL,
    document_id STRING COMMENT 'base64 encoded source' NOT NULL,
    document_source STRING COMMENT 'Source of the document' NOT NULL,
    document_title STRING COMMENT 'Document title' NOT NULL,
    total_pages STRING COMMENT 'Total pages of the document' NOT NULL,
    page STRING COMMENT 'Page number',
    page_label STRING COMMENT 'Page number label' NOT NULL,
    creation_date TIMESTAMP COMMENT 'Document creation date' NOT NULL,
    chunk_content STRING COMMENT 'Chunk text' NOT NULL
)
USING delta 
COMMENT 'Table to store document chunks and related metadata'
TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""
spark.sql(create_source_table_sql)

In [0]:
# Create our index as a triggered delta sync table
# NOTE(review): with pipeline_type="TRIGGERED", changes to the source table are
# presumably only reflected in the index after a sync is requested — confirm.
# NOTE: the name `index` is rebound to a FAISS index further down this notebook.
index = client.create_delta_sync_index(
    endpoint_name="advancing_ai_vector_search",
    source_table_name="advancing_ai.datasets.highway_code",
    index_name="advancing_ai.datasets.highway_code_index",
    pipeline_type="TRIGGERED",
    primary_key="chunk_id",
    embedding_source_column="chunk_content",
    embedding_model_endpoint_name="text-embedding-ada-002"
)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# (column name, Spark type, nullable) for each column of the chunk table;
# only `page` is allowed to be null.
_highway_code_columns = [
    ("chunk_id", StringType(), False),
    ("document_id", StringType(), False),
    ("document_source", StringType(), False),
    ("document_title", StringType(), False),
    ("total_pages", StringType(), False),
    ("page", StringType(), True),
    ("page_label", StringType(), False),
    ("creation_date", TimestampType(), False),
    ("chunk_content", StringType(), False),
]

highway_code_schema = StructType(
    [
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in _highway_code_columns
    ]
)

In [0]:
import base64
from datetime import datetime

def _b64_id(text):
    """Return the URL-safe base64 encoding of `text` (UTF-8) as a str."""
    return base64.urlsafe_b64encode(bytes(text, "utf-8")).decode('utf-8')


# Create a list of row dicts, one per chunk, keyed by the columns of our schema
structured_docs = []
for i, doc in enumerate(split_docs):
    meta = doc.metadata
    structured_docs.append(
        {
            # The chunk id encodes "<source>_<position of the chunk in split_docs>"
            "chunk_id": _b64_id(f"{meta.get('source')}_{i}"),
            "document_id": _b64_id(f"{meta.get('source')}"),
            "document_source": meta.get("source"),
            "document_title": meta.get("title"),
            "total_pages": meta.get("total_pages"),
            "page": meta.get("page"),
            "page_label": meta.get("page_label"),
            "creation_date": datetime.fromisoformat(meta.get("creationdate")),
            "chunk_content": doc.page_content,
        }
    )

In [0]:
# Overwrite the index source table with the freshly chunked documents
(
    spark
    .createDataFrame(structured_docs, highway_code_schema)
    .write
    .mode("overwrite")
    .saveAsTable("advancing_ai.datasets.highway_code")
)

# The index was created with pipeline_type="TRIGGERED", so it does not pick up
# the rows written above until a sync is requested explicitly. The index is
# looked up by name (rather than through the `index` variable) because `index`
# is rebound to a FAISS index later in this notebook.
highway_code_index = client.get_index(
    endpoint_name="advancing_ai_vector_search",
    index_name="advancing_ai.datasets.highway_code_index",
)
# A sync can only be started once the index has finished provisioning.
highway_code_index.wait_until_ready()
highway_code_index.sync()

In [0]:
%sql
-- Create our RAG tool in the Unity Catalog
-- The schema name `agentic-tools` contains a hyphen, so the function name parts are backtick-quoted.
-- The function returns up to 5 rows from the highway_code_index vector index: the chunk text plus
-- a map holding the chunk's page label ('page') and its id ('chunk_id').
CREATE OR REPLACE FUNCTION `advancing_ai`.`agentic-tools`.`search_highway_code` (
  query STRING
  COMMENT 'The query string for searching UK highway code legislation and guidance.'
) RETURNS TABLE
COMMENT 'Executes a search on UK highway code legislation and guidance to retrieve text documents most relevant to the input query.' RETURN

SELECT
  chunk_content,
  map('page', page_label, 'chunk_id', chunk_id) as metadata
FROM
  vector_search(
    index => 'advancing_ai.datasets.highway_code_index',
    query => query,
    num_results => 5
  )

In [0]:
# Have a look at our new function.
# `client` is the VectorSearchClient, which has no `get_function`; Unity Catalog
# functions are looked up through the unitycatalog-ai Databricks function client
# (installed by the %pip cell at the top of the notebook).
from unitycatalog.ai.core.databricks import DatabricksFunctionClient

function_client = DatabricksFunctionClient()
function_client.get_function("advancing_ai.agentic-tools.search_highway_code")

## FAISS Vector Store

In [0]:
import faiss

from databricks_langchain import DatabricksEmbeddings
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [0]:
# Create a Langchain embeddings connection
# Uses the same endpoint name as the Vector Search index's embedding_model_endpoint_name.
# NOTE(review): confirm the installed DatabricksEmbeddings accepts the retry keyword arguments below.
embeddings = DatabricksEmbeddings(
    endpoint="text-embedding-ada-002",
    max_retries=10,
    retry_min_seconds=30,
    retry_max_seconds=120,
)

In [0]:
# Instantiate our FAISS with the vector size we are using (text-embedding-ada-002 is 1536)
# NOTE: this rebinds `index`, which earlier held the result of client.create_delta_sync_index(...).
index = faiss.IndexFlatL2(1536)

In [0]:
# Create our vector database in memory
# It starts out empty: a fresh InMemoryDocstore and an empty index-to-docstore-id mapping.
db = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [0]:
# Add our docs to the db
# The value returned by add_documents is kept in doc_ids (presumably the ids assigned to the stored chunks — confirm).
doc_ids = db.add_documents(documents=split_docs)

In [0]:
# Do a test search to make sure it works!
# As the last expression of the cell, the search result is displayed as the cell output.
db.max_marginal_relevance_search("What can I drive on a bridleway but not on a pavement?")

In [0]:
from typing import List

from langchain_core.documents import Document
from langchain_core.tools import tool

# We can create this as a tool locally
@tool
def search(query: str) -> List[Document]:
    """
    Performs a maximal marginal relevance search on a collection of documents.
    Returns documents that are relevant to the query and diverse among themselves.

    Args:
        query (str): The query to search for.

    Returns:
        List[Document]: The selected documents, with the text in `page_content`
        and the source details in `metadata`.
    """
    # The store's result is passed through unchanged, so the tool yields
    # Document objects rather than plain strings.
    return db.max_marginal_relevance_search(query)