# Azure AI Evaluation – sample notebook

This notebook mirrors the functionality of the original Python script that demonstrates many evaluators available in the `azure-ai-evaluation` package.

⚠️ **Before running**: set these environment variables (e.g. in a `.env` file or in the shell that launches Jupyter).

```
AZURE_OPENAI_ENDPOINT   = https://<account_name>.services.ai.azure.com
AZURE_OPENAI_KEY        = <your-key>
AZURE_OPENAI_DEPLOYMENT = <deployment-name>
AZURE_OPENAI_API_VERSION= 2024-02-15-preview     # or the version you use

# For Azure AI Content Safety (optional – only needed for the cells that pass `azure_ai_project`, i.e. the content-safety, protected-material and pro evaluators):
AZURE_AI_PROJECT_URL    = https://<resource>.services.ai.azure.com/api/projects/<project>
```


In [5]:
import json, pathlib, pandas as pd

def save_evaluation_results(evaluation, output_path="evaluation_results.jsonl"):
    """
    Save evaluation results to a JSONL file (one JSON object per line).

    Parameters:
    -----------
    evaluation : dict or object with to_pandas() method
        The evaluation results to save. A dict must hold its records under
        the "rows" key; any other object is converted with
        ``evaluation.to_pandas().to_dict("records")``.
    output_path : str or pathlib.Path, optional
        Path to save the JSONL file (default: "evaluation_results.jsonl").
        Missing parent directories are created.

    Returns:
    --------
    int : Number of rows saved

    Raises:
    -------
    KeyError : if `evaluation` is a dict without a "rows" key
    """
    if isinstance(evaluation, dict):
        rows = evaluation["rows"]
    else:
        rows = evaluation.to_pandas().to_dict("records")

    # ── JSONL (preferred for re-loading programmatically) ────────────
    jsonl_path = pathlib.Path(output_path)
    # Allow targets such as "results/run1/out.jsonl" on a fresh checkout.
    jsonl_path.parent.mkdir(parents=True, exist_ok=True)

    with jsonl_path.open("w", encoding="utf-8") as f:
        for r in rows:
            # default=str keeps the export from aborting half-way on values
            # JSON cannot encode natively (dates, paths, timestamps, ...).
            f.write(json.dumps(r, ensure_ascii=False, default=str) + "\n")

    print(f"Saved {len(rows)} rows → {jsonl_path}")
    return len(rows)

In [6]:
# Utility cell — load .env (if present) and build a common `model_config`.
import os
from dotenv import load_dotenv

load_dotenv()  # Loads variables from a .env file, if one is found (optional)

# Shared Azure OpenAI settings passed to every model-based evaluator in this notebook.
model_config = {
    "azure_endpoint":   os.getenv("AZURE_OPENAI_ENDPOINT"),    # https://<account>.services.ai.azure.com
    "api_key":          os.getenv("AZURE_OPENAI_KEY"),
    "azure_deployment": os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    # If your SDK version needs it, add "api_version": os.getenv("AZURE_OPENAI_API_VERSION"),
}

# Show the configuration as a sanity check, but never echo the API key:
# cell outputs are stored inside the .ipynb and easily end up in version control.
{
    name: ("<redacted>" if name == "api_key" and value else value)
    for name, value in model_config.items()
}

{'azure_endpoint': 'https://ai-1965.openai.azure.com/',
 'api_key': '',
 'azure_deployment': 'gpt-4.1-mini'}

## 1  Batch evaluation with `evaluate(...)`

In [7]:
from azure.ai.evaluation import (
    evaluate,
    RelevanceEvaluator,
    CoherenceEvaluator,
    IntentResolutionEvaluator,
    ResponseCompletenessEvaluator,
)

# JSONL dataset scored by the batch run — adjust if necessary.
data_path = "../../data/evaluate_test_data.jsonl"

# Evaluators to run, keyed by the name used for each one in the run.
batch_evaluators = dict(
    coherence=CoherenceEvaluator(model_config=model_config),
    relevance=RelevanceEvaluator(model_config=model_config),
    intent_resolution=IntentResolutionEvaluator(model_config=model_config),
)

# Explicit dataset-column → evaluator-input mappings.
# "intent_resolution" deliberately has no entry here, exactly as in the call below.
batch_evaluator_config = {
    "coherence": {
        "column_mapping": {"response": "${data.response}", "query": "${data.query}"},
    },
    "relevance": {
        "column_mapping": {
            "response": "${data.response}",
            "context": "${data.context}",
            "query": "${data.query}",
        },
    },
}

eval_results = evaluate(
    data=data_path,
    evaluators=batch_evaluators,
    evaluator_config=batch_evaluator_config,
)

[2025-06-10 13:48:15 -0700][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_coherence_20250610_134815_069318, log path: C:\Users\jacwang\.promptflow\.runs\azure_ai_evaluation_evaluators_coherence_20250610_134815_069318\logs.txt
[2025-06-10 13:48:15 -0700][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_relevance_20250610_134815_071989, log path: C:\Users\jacwang\.promptflow\.runs\azure_ai_evaluation_evaluators_relevance_20250610_134815_071989\logs.txt
[2025-06-10 13:48:15 -0700][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_intent_resolution_20250610_134815_073102, log path: C:\Users\jacwang\.promptflow\.runs\azure_ai_evaluation_evaluators_intent_resolution_20250610_134815_073102\logs.txt


In [9]:
# Persist the batch-evaluation rows as JSON Lines; the cell displays the number of rows written.
output_filename = "evaluation_results.jsonl"
save_evaluation_results(evaluation=eval_results, output_path=output_filename)

3

---
## 2  Individual evaluators

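Each of the following cells shows a self-contained example, so execute only what you need. One exception: cells that pass `azure_ai_project` and `cred` rely on the setup cell in section 2.1, unless they define those values themselves.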

In [None]:
# BLEU: score a response against a ground-truth reference (no model_config is passed).
from azure.ai.evaluation import BleuScoreEvaluator

bleu = BleuScoreEvaluator()
bleu(
    ground_truth="Paris is the capital of France.",
    response="Lyon is the capital of France.",
)

In [None]:
# Coherence: model-based evaluator, called with a query and the response to it.
from azure.ai.evaluation import CoherenceEvaluator

coh = CoherenceEvaluator(model_config=model_config)
coh(
    response="Paris is the capital of France.",
    query="What is the capital of France?",
)

In [None]:
# Intent Resolution: model-based evaluator, called with a user query and the answer given.
from azure.ai.evaluation import IntentResolutionEvaluator

intent = IntentResolutionEvaluator(model_config=model_config)

intent_inputs = {
    "query": "What are the opening hours of the Eiffel Tower?",
    "response": "Opening hours of the Eiffel Tower are 9:00 AM – 11:00 PM.",
}
intent(**intent_inputs)

### 2.1  Content-safety-based evaluators (Azure AI Content Safety)
These require `AZURE_AI_PROJECT_URL` and an Azure credential (e.g. `DefaultAzureCredential`).

In [None]:
import os
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import (
    ContentSafetyEvaluator,
    HateUnfairnessEvaluator,
    SelfHarmEvaluator,
    SexualEvaluator,
    ViolenceEvaluator,
)

# Credential and project URL shared by the evaluators that take `azure_ai_project`.
cred = DefaultAzureCredential()
azure_ai_project = os.environ.get("AZURE_AI_PROJECT_URL")

content_safe = ContentSafetyEvaluator(
    credential=cred,
    azure_ai_project=azure_ai_project,
)
content_safe(
    response="I am an AI assistant.",
    query="Who are you?",
)

In [None]:
# Additional safety evaluators – run as needed.
# All four are built from the same project URL and credential; only `hate` is invoked here.
safety_kwargs = dict(azure_ai_project=azure_ai_project, credential=cred)

hate = HateUnfairnessEvaluator(**safety_kwargs)
selfharm = SelfHarmEvaluator(**safety_kwargs)
sexual = SexualEvaluator(**safety_kwargs)
violence = ViolenceEvaluator(**safety_kwargs)

hate(
    response="Paris",
    query="Where are you from?",
)

In [None]:
# F1 score: compare a response with a ground-truth reference (no model_config is passed).
from azure.ai.evaluation import F1ScoreEvaluator

f1 = F1ScoreEvaluator()
f1(
    ground_truth="Paris is the capital of France.",
    response="Lyon is the capital of France.",
)

In [None]:
# Fluency: model-based evaluator, called here with the response alone.
from azure.ai.evaluation import FluencyEvaluator

fluency = FluencyEvaluator(model_config=model_config)

fluency_response = "Paris is the capital of France."
fluency(response=fluency_response)

In [None]:
# GLEU: compare a response with a ground-truth reference (no model_config is passed).
from azure.ai.evaluation import GleuScoreEvaluator

gleu = GleuScoreEvaluator()
gleu(
    ground_truth="France's capital is Paris.",
    response="Paris is the capital of France.",
)

In [None]:
# Groundedness: model-based evaluator, called with a response and the context it should rest on.
from azure.ai.evaluation import GroundednessEvaluator

ground = GroundednessEvaluator(model_config=model_config)

# Two fragments joined by a single space, giving one context string.
grounding_context = " ".join(
    [
        "France, located in Western Europe, …",
        "Paris is renowned for art, fashion, the Eiffel Tower, and the Louvre.",
    ]
)
ground(
    context=grounding_context,
    response="Paris is the capital of France.",
)

In [None]:
# METEOR: compare a response with a ground-truth reference; `alpha` is set explicitly to 0.8.
from azure.ai.evaluation import MeteorScoreEvaluator

meteor = MeteorScoreEvaluator(alpha=0.8)
meteor(
    ground_truth="France's capital is Paris.",
    response="Paris is the capital of France.",
)

In [None]:
# Protected Material
import os

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ProtectedMaterialEvaluator

# Define the project URL and credential in this cell so it also runs on a fresh
# kernel, without first executing the content-safety cell that defines them.
azure_ai_project = os.getenv("AZURE_AI_PROJECT_URL")
cred = DefaultAzureCredential()

prot = ProtectedMaterialEvaluator(azure_ai_project=azure_ai_project, credential=cred)
prot(
    query="Write me a catchy song",
    response=(
        "You are the dancing queen, young and sweet, only seventeen. "
        "Feel the beat from the tambourine, oh yeah."),
)

In [None]:
# QA Evaluator: model-based evaluator, called with query, response, ground truth and context together.
from azure.ai.evaluation import QAEvaluator

qa = QAEvaluator(model_config=model_config)
qa(
    query="This's the color?",
    response="Black",
    ground_truth="gray",
    context="gray",
)

In [None]:
# Relevance: model-based evaluator, called with a query and the response to it.
from azure.ai.evaluation import RelevanceEvaluator

rel = RelevanceEvaluator(model_config=model_config)
rel(
    response="The capital of Japan is Tokyo.",
    query="What is the capital of Japan?",
)

In [None]:
# Retrieval: model-based evaluator, called here with a multi-turn conversation.
from azure.ai.evaluation import RetrievalEvaluator

ret = RetrievalEvaluator(model_config=model_config)

# (content, role, per-message context) for each turn, in conversation order.
turns = [
    ("What is the capital of France?", "user", "Customer wants to know the capital of France"),
    ("Paris", "assistant", "Paris is the capital of France"),
    ("What is the capital of Hawaii?", "user", "Customer wants to know the capital of Hawaii"),
    ("Honolulu", "assistant", "Honolulu is the capital of Hawaii"),
]

conversation = {
    "messages": [
        {"content": content, "role": role, "context": context}
        for content, role, context in turns
    ],
    "context": "Global context",
}
ret(conversation=conversation)

In [None]:
# ROUGE: the variant is chosen with `rouge_type`; this cell selects RougeType.ROUGE_4.
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

rouge = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_4)
rouge(
    ground_truth="France's capital is Paris.",
    response="Paris is the capital of France.",
)

In [None]:
# Semantic Similarity: model-based evaluator, called with a query, a response and a ground truth.
from azure.ai.evaluation import SimilarityEvaluator

sim = SimilarityEvaluator(model_config=model_config)

similarity_inputs = {
    "query": "What is the capital of Japan?",
    "response": "The capital of Japan is Tokyo.",
    "ground_truth": "Tokyo is Japan's capital.",
}
sim(**similarity_inputs)

In [None]:
# Response Completeness: model-based evaluator, called with a response and a ground truth.
from azure.ai.evaluation import ResponseCompletenessEvaluator

comp = ResponseCompletenessEvaluator(model_config=model_config)
comp(
    ground_truth="Tokyo is Japan's capital.",
    response="The capital of Japan is Tokyo.",
)

In [None]:
# Task Adherence: model-based evaluator, called with message lists plus tool definitions.
from azure.ai.evaluation import TaskAdherenceEvaluator

task = TaskAdherenceEvaluator(model_config=model_config)

# Input side: the system prompt followed by the user's question.
query_msg = [
    dict(role="system", content="You are a helpful customer service agent."),
    dict(
        role="user",
        content=[dict(type="text", text="What is the status of my order #123?")],
    ),
]

# Output side, turn 1: the assistant requests the `get_order` tool.
order_lookup_turn = {
    "role": "assistant",
    "content": [
        {
            "type": "tool_call",
            "tool_call": {
                "id": "tool_001",
                "type": "function",
                "function": {"name": "get_order", "arguments": {"order_id": "123"}},
            },
        }
    ],
}

# Turn 2: the tool result, linked to the call above through `tool_call_id`.
order_result_turn = {
    "role": "tool",
    "tool_call_id": "tool_001",
    "content": [
        {
            "type": "tool_result",
            "tool_result": '{ "order": { "id": "123", "status": "shipped" } }',
        }
    ],
}

# Turn 3: the assistant's final text answer.
final_answer_turn = {
    "role": "assistant",
    "content": [{"type": "text", "text": "Your order #123 has been shipped."}],
}

response_msg = [order_lookup_turn, order_result_turn, final_answer_turn]

# Schema of the single tool referenced by the tool call above.
tool_defs = [
    dict(
        name="get_order",
        description="Get order details.",
        parameters={
            "type": "object",
            "properties": {"order_id": {"type": "string"}},
        },
    )
]

task(tool_definitions=tool_defs, query=query_msg, response=response_msg)

### 2.2  Pro evaluators (Groundedness Pro, Indirect Attack …)
Also require `AZURE_AI_PROJECT_URL` + credentials.

In [None]:
import os

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import IndirectAttackEvaluator, GroundednessProEvaluator

# Define the project URL and credential in this cell so it also runs on a fresh
# kernel, without first executing the content-safety cell that defines them.
azure_ai_project = os.getenv("AZURE_AI_PROJECT_URL")
cred = DefaultAzureCredential()

indirect = IndirectAttackEvaluator(azure_ai_project=azure_ai_project, credential=cred)
ground_pro = GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=cred)

indirect_result = indirect(query="What is the capital of France?", response="Paris")
ground_pro_result = ground_pro(
    query="What shape has 4 equilateral sides?",
    response="Rhombus",
    context="Rhombus is a shape with 4 equilateral sides."
)

# A cell only displays its last expression, so return both results together;
# otherwise the indirect-attack result would be computed and silently dropped.
{"indirect_attack": indirect_result, "groundedness_pro": ground_pro_result}

---
## Done
You can now explore or adapt the individual evaluator calls as required.