### Evaluating RAG Model with Databricks Evals and Traces

![rag_eval](./Assets/rag_eval.png)

### Installing Libraries and Utilities

In [None]:
# Install exact (==-pinned) versions of the vector search client, OpenAI SDK, MLflow and databricks-agents
%pip install databricks-vectorsearch==0.63 openai==2.15.0 mlflow==3.8.1 databricks-agents==1.9.2

### Restarting our Python Kernel

In [None]:
# Restart Python after the %pip install above so that later cells import the newly installed versions.
# NOTE(review): a restart discards Python state defined before this call — keep this cell ahead of all imports.
dbutils.library.restartPython()

### Setting Up MLflow Tracing and Experiment

In [None]:
import mlflow
import os
from openai import OpenAI

# Workspace path of the MLflow experiment used by this notebook (replace the placeholder user id)
EXPERIMENT_PATH = "/Users/YOUR_USER_ID_GOES_HERE/rag-app"

# Switch on OpenAI autologging so the application is instrumented with MLflow Tracing
mlflow.openai.autolog()

# Point MLflow tracking at Databricks and select the experiment defined above
mlflow.set_tracking_uri(uri="databricks")
mlflow.set_experiment(experiment_name=EXPERIMENT_PATH)

### Creating RAG Model with MLflow Tracing Enabled

In [None]:
import mlflow
from mlflow import pyfunc
from openai import OpenAI

class RAGModel(pyfunc.PythonModel):
    """Retrieval-augmented generation model packaged as an MLflow pyfunc.

    For a query, the model runs a similarity search against the supplied
    vector index and passes the query together with the raw search result to
    a chat-completions endpoint, returning the generated text.

    Credentials are read from the environment at call time instead of being
    written into the class body, so no access token ends up in the notebook
    source or in anything serialized from this class:

    * ``DATABRICKS_TOKEN`` - access token used as the OpenAI client API key.
    * ``DATABRICKS_HOST``  - workspace URL or hostname; ``/serving-endpoints``
      is appended to form the client base URL.
    """

    def __init__(self, vector_index):
        """Store the vector index object whose ``similarity_search`` is used for retrieval."""
        self.vector_index = vector_index

    @mlflow.trace(span_type="RETRIEVER")
    def retrieve(self, query):
        """Run a similarity search for ``query`` and return the result unchanged.

        Requests the ``id``, ``content_path`` and ``chunk`` columns and up to
        10 results from the vector index.
        """
        results_dict = self.vector_index.similarity_search(
            query_text=query,
            columns=["id", "content_path", "chunk"],
            num_results=10,
        )
        return results_dict

    @staticmethod
    def _serving_credentials():
        """Return ``(api_key, base_url)`` for the serving endpoints.

        Deliberately not traced, so the token is never recorded as a span output.

        Raises:
            RuntimeError: if ``DATABRICKS_TOKEN`` or ``DATABRICKS_HOST`` is unset or empty.
        """
        # Imported locally so the method does not rely on notebook-level imports.
        import os

        token = os.environ.get("DATABRICKS_TOKEN")
        host = os.environ.get("DATABRICKS_HOST")
        if not token or not host:
            raise RuntimeError(
                "Set the DATABRICKS_TOKEN and DATABRICKS_HOST environment variables "
                "(for example from a secret scope) before calling the model."
            )

        # Accept either a bare hostname or a full URL, with or without a trailing slash.
        if not host.startswith(("http://", "https://")):
            host = f"https://{host}"
        return token, f"{host.rstrip('/')}/serving-endpoints"

    @mlflow.trace
    def chatCompletionsAPI(self, user_query, supporting_knowledge):
        """Ask the chat model to answer ``user_query`` using ``supporting_knowledge``.

        Args:
            user_query: The question text.
            supporting_knowledge: Context interpolated into the user message
                (its string form is what the model sees).

        Returns:
            The content of the first completion choice.

        Raises:
            RuntimeError: if the credential environment variables are missing.
        """
        api_key, base_url = self._serving_credentials()
        openai_client = OpenAI(api_key=api_key, base_url=base_url)

        completion = openai_client.chat.completions.create(
            model="databricks-claude-haiku-4-5",
            messages=[
                {
                    "role": "system",
                    "content": [
                        {
                            "type": "text",
                            "text": "You are a helpful assistant. You will be passed the user query and the supporting knowledge that can be used to answer the user_query"
                        }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"user query : {user_query} and supporting knowledge: {supporting_knowledge}"
                        }
                    ]
                }
            ]
        )

        return completion.choices[0].message.content

    def predict(self, context, data):
        """Answer the query found in the first row of ``data``.

        Only ``data["user_query"].iloc[0]`` is read; any further rows are
        ignored. ``context`` is accepted but not used. Returns the generated
        answer text.
        """
        query = data["user_query"].iloc[0]
        text_data = self.retrieve(query)
        return self.chatCompletionsAPI(query, text_data)


### Fetching our Mosaic AI Vector Index from Unity Catalog

In [None]:
from databricks.vector_search.client import VectorSearchClient

# Fully qualified index name in the form catalog.schema.index_name
# (make sure this matches your vector index in Unity Catalog)
VECTOR_INDEX_NAME = "YOUR_UNITY_CATALOG_NAME.rag.rag_vector_index"

vector_client = VectorSearchClient()
vector_index = vector_client.get_index(index_name=VECTOR_INDEX_NAME)

### Saving our Model

In [None]:
# Build the RAG pyfunc model around the vector index fetched in the previous section
test_model = RAGModel(vector_index=vector_index)

In [None]:
from mlflow.models import infer_signature
import pandas as pd

import shutil
from pathlib import Path

# Sample input: one row carrying the "user_query" column that the model reads
input_example = pd.DataFrame([
    {"user_query": "Hi How are you?"}
])

# Sample output used only to infer the output half of the signature.
# NOTE(review): RAGModel.predict returns a plain string rather than a DataFrame —
# confirm that a "predictions" column is the output schema you want recorded.
output_example = pd.DataFrame([
    {
        "predictions": "I am good thank you!"
    }
])

# Infer full signature (input + output)
signature = infer_signature(input_example, output_example)

model_path = "rag-model-experimentation"

# save_model refuses to write into an existing, non-empty directory, so drop the
# output of a previous run first; this keeps the cell re-runnable.
if Path(model_path).is_dir():
    shutil.rmtree(model_path)

mlflow.pyfunc.save_model(path=model_path, python_model=test_model, signature=signature)

### Loading our Saved Model

In [None]:
# Read the custom pyfunc model back from the local directory it was saved to
loaded_pyfunc_model = mlflow.pyfunc.load_model(model_uri="rag-model-experimentation")

In [None]:
# One-row request frame keyed by the column name the model reads
model_input = pd.DataFrame(
    {"user_query": ["what is the carbonops ESG Intelligence Model? Give Citations too"]}
)

model_response = loaded_pyfunc_model.predict(model_input)
print(model_response)

### Simulating Production Traffic

In [None]:
# Questions replayed against the model to mimic production traffic
simulated_queries = [
    "Tell me something about CarbonOps Global Remote Work Policy",
    "Tell me something about CarbonOps Sustainable Development Goals Mapper",
    "How does CarbonOps MARK framework help in ESG materiality Analysis",
    "Comment on the Work Culture at CarbonOps",
    "Explain Materiality Dimensions in European ESRS Reporting",
]
test_requests = [{"user_query": query_text} for query_text in simulated_queries]

# Run the requests one at a time; a failing request is reported and the loop moves on
print("Simulating production traffic........")
for request_payload in test_requests:
    try:
        request_frame = pd.DataFrame([request_payload])
        result = loaded_pyfunc_model.predict(request_frame)
        print(f"Question: {request_payload['user_query']} \n")
        print(f"Answer: {result} \n")
    except Exception as e:
        print(f"encountered error: {e}")

### Running Evaluation using MLflow

In [None]:
from mlflow.genai.scorers import (
    RetrievalGroundedness,
    RelevanceToQuery,
    Safety,
    Guidelines,
)
import pandas as pd
import mlflow
from mlflow.genai import datasets

# Fully qualified Unity Catalog table name of the evaluation dataset (replace the placeholder)
EVAL_DATASET_NAME = "YOUR_UNITY_CATALOG_NAME.SCHEMA.TABLE"

# Fetch the evaluation dataset stored under that name
eval_dataset = datasets.get_dataset(EVAL_DATASET_NAME)

# Adapter whose parameters mirror the evaluation dataset's input fields
def predict_wrapper(user_query, supporting_knowledge=None):
    """Run the loaded pyfunc model on a single query.

    Args:
        user_query: Question text; becomes the only row of the "user_query" column.
        supporting_knowledge: Accepted so the signature matches the dataset
            inputs, but not forwarded to the model.

    Returns:
        Whatever ``loaded_pyfunc_model.predict`` returns for the one-row frame.
    """
    single_row_frame = pd.DataFrame({"user_query": [user_query]})
    return loaded_pyfunc_model.predict(single_row_frame)

# Judges applied to each response: a custom conciseness guideline plus the
# relevance-to-query and safety scorers imported above
conciseness_judge = Guidelines(
    name="Conciseness",
    guidelines="The response should be concise and to the point",
)
eval_judges = [conciseness_judge, RelevanceToQuery(), Safety()]

# Generate answers with predict_wrapper for the dataset rows and score them
eval_results = mlflow.genai.evaluate(
    data=eval_dataset,
    predict_fn=predict_wrapper,
    scorers=eval_judges,
)