In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload for imported Python modules; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
##################################################################################
# Agent Evaluation
# 
# Notebook that downloads an evaluation dataset and evaluates the model using
# llm-as-a-judge with the Databricks agent framework.
#
# Parameters:
# * uc_catalog (required)           - Name of the Unity Catalog 
# * schema (required)               - Name of the schema inside Unity Catalog 
# * eval_table (required)           - Name of the table containing the evaluation dataset
# * experiment (required)           - Name of the experiment to register the run under
# * registered_model (required)     - Name of the model registered in mlflow
# * model_version (required)        - Model version to evaluate
#
# Widgets:
# * Unity Catalog: Text widget to input the name of the Unity Catalog
# * Schema: Text widget to input the name of the database inside the Unity Catalog
# * Evaluation dataset: Text widget to input the name of the table containing the evaluation dataset
# * Experiment name: Text widget to input the name of the experiment to register the run under
# * Registered model name: Text widget to input the name of the model registered in mlflow
# * Model Version: Text widget to input the model version to evaluate
#
# Usage:
# 1. Set the appropriate values for the widgets.
# 2. Run to evaluate your agent.
#
##################################################################################

In [0]:
%pip install -qqqq -r ../../agent_requirements.txt.tmpl
# Quietly installs the packages listed in the requirements template located two directories above this notebook.

In [0]:
%pip freeze | grep databricks
# Filters the `pip freeze` listing down to lines mentioning "databricks", to show which versions are installed.

In [0]:
# Input arguments needed to run the notebook as a job.
# Supply them through Databricks widgets or notebook arguments.
# Each entry is (widget name, default value, label shown in the UI).
_WIDGET_SPECS = [
    # Unity Catalog containing the model
    ("uc_catalog", "ai_agent_stacks", "Unity Catalog"),
    # Schema inside the catalog
    ("schema", "ai_agent_ops", "Schema"),
    # Table holding the evaluation dataset
    ("eval_table", "databricks_documentation_eval", "Evaluation dataset"),
    # MLflow experiment to register the run under
    ("experiment", "agent_function_chatbot", "Experiment name"),
    # Model name as registered in MLflow
    ("registered_model", "agent_function_chatbot", "Registered model name"),
    # Version of the registered model
    ("model_version", "1", "Model Version"),
]

# Create the text widgets in the order listed above.
for _widget_name, _widget_default, _widget_label in _WIDGET_SPECS:
    dbutils.widgets.text(_widget_name, _widget_default, label=_widget_label)

In [0]:
# Names of the notebook parameters; every one of them is mandatory.
_REQUIRED_PARAMS = (
    "uc_catalog",
    "schema",
    "eval_table",
    "experiment",
    "registered_model",
    "model_version",
)

# Read every widget first, then bind the values to the notebook-level names.
_param_values = [dbutils.widgets.get(_param) for _param in _REQUIRED_PARAMS]
(
    uc_catalog,
    schema,
    eval_table,
    experiment,
    registered_model,
    model_version,
) = _param_values

# Fail fast, in declaration order, on the first parameter left empty.
for _param, _value in zip(_REQUIRED_PARAMS, _param_values):
    assert _value != "", f"{_param} notebook parameter must be specified"

In [0]:
import os

# Directory that contains this notebook: the notebook path reported by the
# Databricks notebook context, minus its last component, prefixed with '/Workspace/'.
notebook_path =  '/Workspace/' + os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get())
# Change the working directory to the notebook's folder, then to its sibling `evaluation` folder.
%cd $notebook_path
%cd ../evaluation

In [0]:
import pandas as pd

def get_reference_documentation(
    catalog,
    schema,
    table,
    spark,
    source_url='https://notebooks.databricks.com/demos/dbdemos-dataset/llm/databricks-documentation/databricks_doc_eval_set.parquet',
):
    """Load the reference evaluation set into a table and return it as a Spark DataFrame.

    The parquet file at ``source_url`` is read with pandas, converted to a Spark
    DataFrame and written to ``{catalog}.{schema}.{table}``. The table is
    overwritten on every call. The freshly written table is then read back.

    Args:
        catalog: Catalog name used as the first part of the table identifier.
        schema: Schema name used as the second part of the table identifier.
        table: Table name used as the last part of the table identifier.
        spark: Spark session used to create, write and read the DataFrame.
        source_url: Location of the parquet file to load. Defaults to the
            Databricks documentation evaluation set.

    Returns:
        The result of ``spark.read.table`` for the written table.
    """
    # Build the three-part identifier once so the write and the read cannot drift apart.
    table_name = f"{catalog}.{schema}.{table}"

    reference_pdf = pd.read_parquet(source_url)
    spark.createDataFrame(reference_pdf).write.mode('overwrite').saveAsTable(table_name)

    return spark.read.table(table_name)


In [0]:
# The helper is defined in a cell of this notebook, so the module import below stays disabled.
# from evaluation import get_reference_documentation

# Write the reference evaluation set to {uc_catalog}.{schema}.{eval_table} and read it back.
eval_dataset = get_reference_documentation(uc_catalog, schema, eval_table, spark)

# Preview up to 10 rows of the evaluation dataset.
display(eval_dataset.head(10))

In [0]:
# NOTE(review): `databricks.agents` is not referenced by name below; presumably it is
# imported so the "databricks-agent" model type is available to mlflow.evaluate.
# Confirm before removing.
import databricks.agents
import re
import mlflow

# Workaround for serverless compatibility
# Replaces a private MLflow helper so that it always returns "databricks-uc".
# NOTE(review): this patches an underscore-prefixed internal; it may break on MLflow upgrades.
mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = lambda: "databricks-uc"

# Retrieve model info for run
client = mlflow.MlflowClient()

# Look up the requested version of {uc_catalog}.{schema}.{registered_model};
# its run_id is used below to locate the logged model.
model_info = client.get_model_version(f"{uc_catalog}.{schema}.{registered_model}", model_version)

# Set Experiment
# `path` is the part of notebook_path that precedes the first "/agent_evaluation",
# or None when notebook_path does not contain that segment.
pattern = r"^(.*?)/agent_evaluation"
match = re.match(pattern, notebook_path)
if match:
    path = match.group(1)
else:
    path = None

# NOTE(review): the set_experiment call is commented out, so `path` and the required
# `experiment` parameter are currently unused by this cell. Confirm whether that is
# intended; re-enabling it as-is would produce "None/<experiment>" when `path` is None.
# mlflow.set_experiment(f"{path}/{experiment}")

with mlflow.start_run():
    # Evaluate the logged model
    # Only up to 10 rows of the evaluation dataset are scored (limit(10)); the model is
    # loaded from the "model" artifact of the run that produced the registered version.
    eval_results = mlflow.evaluate(
        data=eval_dataset.limit(10),
        model=f'runs:/{model_info.run_id}/model',
        model_type="databricks-agent"
    )