# RAG Evaluation

## Load Dependencies

# Evaluation SDK: provides the `azure.ai.evaluation` evaluators and `evaluate()` used in this notebook.
%pip install azure-ai-evaluation

# Promptflow Azure integration.
# NOTE(review): presumably required for the promptflow-based runs below — confirm it is still needed.
%pip install promptflow-azure

## Load Azure configurations

In [1]:
import os
import warnings

from dotenv import load_dotenv

# Take environment variables from a local .env file.
load_dotenv()

# Azure OpenAI settings; passed to the evaluators that are constructed from model_config.
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")

model_config = {
    "azure_endpoint": azure_openai_endpoint,
    "api_key": azure_openai_key,
    "azure_deployment": azure_openai_deployment,
}

# Azure AI project coordinates; passed to the safety evaluators and to evaluate()
# when results are tracked in Azure AI Foundry.
azure_subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")
azure_resource_group_name = os.getenv("AZURE_RESOURCE_GROUP_NAME")
azure_project_name = os.getenv("AZURE_PROJECT_NAME")

azure_ai_project = {
    "subscription_id": azure_subscription_id,
    "resource_group_name": azure_resource_group_name,
    "project_name": azure_project_name,
}

# os.getenv() returns None for unset variables, which would otherwise only show up
# later as hard-to-read service errors. Report missing settings here instead; this
# is a warning rather than an exception so that cells which do not need every
# setting can still run.
_expected_env_vars = (
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_KEY",
    "AZURE_OPENAI_DEPLOYMENT",
    "AZURE_SUBSCRIPTION_ID",
    "AZURE_RESOURCE_GROUP_NAME",
    "AZURE_PROJECT_NAME",
)
_missing_env_vars = [name for name in _expected_env_vars if not os.getenv(name)]
if _missing_env_vars:
    warnings.warn(
        "Missing environment variables (check your .env file): "
        + ", ".join(_missing_env_vars)
    )

## Get the first row to test

In [2]:
import json

# Evaluation set in JSON Lines format: one JSON object per line.
eval_data_path = 'Data/output/nasabooks-evalset.jsonl'

# Read explicitly as UTF-8 so the result does not depend on the platform's default
# encoding, and skip blank lines, which json.loads() would reject.
with open(eval_data_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Fail with a clear message instead of a bare IndexError on data[0].
if not data:
    raise ValueError(f"No records found in {eval_data_path}")

# Use the first record as a single test sample for the evaluators below.
first_row = data[0]

# Assign values to variables
context = first_row['context']
query = first_row['query']
ground_truth = first_row['ground_truth']
response = first_row['response']

## Performance Evaluators

In [3]:
from azure.ai.evaluation import (
    BleuScoreEvaluator,
    CoherenceEvaluator,
    F1ScoreEvaluator,
    FluencyEvaluator,
    GleuScoreEvaluator,
    GroundednessEvaluator,
    MeteorScoreEvaluator,
    RelevanceEvaluator,
    RougeScoreEvaluator,
    RougeType,
    SimilarityEvaluator,
)

# --- Evaluators constructed from model_config -------------------------------
# Each one is applied to the single sample (query / context / response /
# ground_truth) taken from the first row of the evaluation set.
groundedness_eval = GroundednessEvaluator(model_config)
groundedness_score = groundedness_eval(response=response, context=context)

relevance_eval = RelevanceEvaluator(model_config)
relevance_score = relevance_eval(response=response, context=context, query=query)

coherence_eval = CoherenceEvaluator(model_config)
coherence_score = coherence_eval(response=response, query=query)

fluency_eval = FluencyEvaluator(model_config)
fluency_score = fluency_eval(response=response, query=query)

similarity_eval = SimilarityEvaluator(model_config)
similarity_score = similarity_eval(response=response, query=query, ground_truth=ground_truth)

# --- Evaluators that compare the response with the ground truth -------------
# These are constructed without model_config.
f1_eval = F1ScoreEvaluator()
f1_score = f1_eval(response=response, ground_truth=ground_truth)

# RougeType offers ROUGE_1, ROUGE_2, ROUGE_3, ROUGE_4, ROUGE_5, and ROUGE_L.
rouge_eval = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
rouge_score = rouge_eval(response=response, ground_truth=ground_truth)

bleu_eval = BleuScoreEvaluator()
bleu_score = bleu_eval(response=response, ground_truth=ground_truth)

meteor_eval = MeteorScoreEvaluator(alpha=0.9, beta=3.0, gamma=0.5)
meteor_score = meteor_eval(response=response, ground_truth=ground_truth)

gleu_eval = GleuScoreEvaluator()
gleu_score = gleu_eval(response=response, ground_truth=ground_truth)

# Print every result, one per line, in the order the scores were computed.
for score in (
    groundedness_score,
    relevance_score,
    coherence_score,
    fluency_score,
    similarity_score,
    f1_score,
    rouge_score,
    bleu_score,
    meteor_score,
    gleu_score,
):
    print(score)

[INFO] Could not import AIAgentConverter. Please install the dependency with `pip install azure-ai-projects`.
[INFO] Could not import SKAgentConverter. Please install the dependency with `pip install semantic-kernel`.
{'groundedness': 5.0, 'gpt_groundedness': 5.0, 'groundedness_reason': 'The RESPONSE is fully grounded and complete, accurately conveying all essential information from the CONTEXT without introducing unsupported details or omitting critical points.', 'groundedness_result': 'pass', 'groundedness_threshold': 3}
{'relevance': 5.0, 'gpt_relevance': 5.0, 'relevance_reason': 'The RESPONSE directly answers the QUERY with accurate and complete information about the event that occurred off Vancouver Island in August 2016, making it fully relevant.', 'relevance_result': 'pass', 'relevance_threshold': 3}
{'coherence': 4.0, 'gpt_coherence': 4.0, 'coherence_reason': 'The RESPONSE is coherent and directly addresses the QUERY, providing relevant details and maintaining a logical flow of

## Risk and Safety Metrics

1. Install the Azure CLI in GitHub Codespaces
- curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

2. Log in with your Azure account
- az login --use-device-code
- Once you've logged in, select your subscription in the terminal.

In [7]:
from azure.ai.evaluation import (
    HateUnfairnessEvaluator,
    SelfHarmEvaluator,
    SexualEvaluator,
    ViolenceEvaluator,
)
from azure.identity import DefaultAzureCredential

# Share a single credential across all safety evaluators instead of creating a
# separate DefaultAzureCredential for each one.
credential = DefaultAzureCredential()

# NOTE(review): a 404 "Failed to connect to your Azure AI project" raised from this
# cell means the values in azure_ai_project (AZURE_SUBSCRIPTION_ID,
# AZURE_RESOURCE_GROUP_NAME, AZURE_PROJECT_NAME) do not resolve to a project the
# signed-in account can access — verify them before re-running.

# Score the single sample (query / response) with each risk and safety evaluator.
violence_eval = ViolenceEvaluator(azure_ai_project=azure_ai_project, credential=credential)
violence_score = violence_eval(query=query, response=response)

hateunfairness_eval = HateUnfairnessEvaluator(azure_ai_project=azure_ai_project, credential=credential)
hateunfairness_score = hateunfairness_eval(query=query, response=response)

selfharm_eval = SelfHarmEvaluator(azure_ai_project=azure_ai_project, credential=credential)
selfharm_score = selfharm_eval(query=query, response=response)

sexual_eval = SexualEvaluator(azure_ai_project=azure_ai_project, credential=credential)
sexual_score = sexual_eval(query=query, response=response)

print(violence_score)
print(hateunfairness_score)
print(selfharm_score)
print(sexual_score)

EvaluationException: (UserError) Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, and make sure you have the necessary access permissions. Status code: 404.
Visit https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot to troubleshoot this issue.

## Evaluate test dataset

In [6]:
from azure.ai.evaluation import evaluate
from azure.ai.evaluation import GroundednessEvaluator, RetrievalEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator, SimilarityEvaluator, F1ScoreEvaluator
from azure.ai.evaluation import RougeScoreEvaluator, RougeType
from azure.ai.evaluation import BleuScoreEvaluator
from azure.ai.evaluation import MeteorScoreEvaluator
from azure.ai.evaluation import GleuScoreEvaluator
from azure.ai.evaluation import ViolenceEvaluator, HateUnfairnessEvaluator, SelfHarmEvaluator, SexualEvaluator
from azure.identity import DefaultAzureCredential
import pandas as pd

# Evaluators constructed from the Azure OpenAI model configuration.
groundedness_eval = GroundednessEvaluator(model_config)
retrieval_eval = RetrievalEvaluator(model_config)
relevance_eval = RelevanceEvaluator(model_config)
coherence_eval = CoherenceEvaluator(model_config)
fluency_eval = FluencyEvaluator(model_config)
similarity_eval = SimilarityEvaluator(model_config)

# Evaluators constructed without a model configuration.
f1_eval = F1ScoreEvaluator()
rouge_eval = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
bleu_eval = BleuScoreEvaluator()
meteor_eval = MeteorScoreEvaluator(
    alpha=0.9,
    beta=3.0,
    gamma=0.5
)
gleu_eval = GleuScoreEvaluator()

# Risk and safety evaluators: share a single credential instead of creating a
# separate DefaultAzureCredential for each one.
credential = DefaultAzureCredential()
violence_eval = ViolenceEvaluator(azure_ai_project=azure_ai_project, credential=credential)
hateunfairness_eval = HateUnfairnessEvaluator(azure_ai_project=azure_ai_project, credential=credential)
selfharm_eval = SelfHarmEvaluator(azure_ai_project=azure_ai_project, credential=credential)
sexual_eval = SexualEvaluator(azure_ai_project=azure_ai_project, credential=credential)

path = "Data/output/nasabooks-evalset.jsonl"

# Run every evaluator over the whole evaluation set.
result = evaluate(
    data=path, # provide your data here
    evaluators={
        "groundedness": groundedness_eval,
        "retrieval": retrieval_eval,
        "relevance": relevance_eval,
        "coherence": coherence_eval,
        "fluency": fluency_eval,
        "similarity": similarity_eval,
        "f1_score": f1_eval,
        "rouge_score": rouge_eval,
        "bleu_score": bleu_eval,
        "meteor_score": meteor_eval,
        "gleu_score": gleu_eval,
        "violence_score": violence_eval,
        "hateunfairness_score": hateunfairness_eval,
        "selfharm_score": selfharm_eval,
        "sexual_score": sexual_eval
    },
    # column mapping
    # NOTE(review): recent azure-ai-evaluation releases appear to expect the mapping
    # nested under a "column_mapping" key for each entry — confirm this form is
    # honoured by the installed version. The dataset columns already use these
    # names, so the evaluators receive the same inputs either way.
    evaluator_config={
        "default": {
            "query": "${data.query}",
            "response": "${data.response}",
            "context": "${data.context}",
            "ground_truth": "${data.ground_truth}"
        }
    }
)

# Persist the per-row results so they can be inspected outside the notebook.
df = pd.DataFrame(result["rows"])
# Save the DataFrame to a CSV file
df.to_csv('Data/output/nasabooks-evalresult.csv', index=False)

print("DataFrame has been successfully saved to nasabooks-evalresult.csv")

[2025-07-04 08:48:36 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_similarity_20250704_084836_237977, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_similarity_20250704_084836_237977/logs.txt
[2025-07-04 08:48:36 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_fluency_20250704_084836_237228, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_fluency_20250704_084836_237228/logs.txt
[2025-07-04 08:48:36 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_coherence_20250704_084836_236885, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_coherence_20250704_084836_236885/logs.txt
[2025-07-04 08:48:36 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_retrieval_20250704_084836_235729, log pat

Traceback (most recent call last):
  File "/workspaces/azure-ai-rag/.venv/lib/python3.12/site-packages/azure/ai/evaluation/_legacy/prompty/_prompty.py", line 382, in _send_with_retries
    response = await client.chat.completions.create(**params)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/azure-ai-rag/.venv/lib/python3.12/site-packages/promptflow/tracing/_integrations/_openai_injector.py", line 88, in wrapper
    return await f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/azure-ai-rag/.venv/lib/python3.12/site-packages/promptflow/tracing/_trace.py", line 476, in wrapped
    return await func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/azure-ai-rag/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2028, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/azure-ai-rag/.venv/lib/python3.12/site-packages/openai/_base

[2025-07-04 08:49:43 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_f1_score_20250704_084836_238992, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_f1_score_20250704_084836_238992/logs.txt
[2025-07-04 08:49:44 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_rouge_score_20250704_084836_239045, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_rouge_score_20250704_084836_239045/logs.txt
[2025-07-04 08:49:44 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_bleu_score_20250704_084836_239075, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_bleu_score_20250704_084836_239075/logs.txt
[2025-07-04 08:49:45 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_meteor_score_20250704_084836_239100

2025-07-04 08:49:46 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:49:46 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 5.87 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-07-04 08:49:46 +0000   67620 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.


[2025-07-04 08:49:47 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_sexual_score_20250704_084836_239213, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_sexual_score_20250704_084836_239213/logs.txt


2025-07-04 08:49:47 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:49:47 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 5.94 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-07-04 08:50:05 +0000   67620 execution.bulk     INFO     Finished 9 / 12 lines.
2025-07-04 08:50:05 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 2.18 seconds. Estimated time for incomplete lines: 6.54 seconds.
2025-07-04 08:50:05 +0000   67620 execution.bulk     INFO     Finished 11 / 12 lines.
2025-07-04 08:50:05 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.79 seconds. Estimated time for incomplete lines: 1.79 seconds.
2025-07-04 08:50:05 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:50:05 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.67 seconds. Estimated time for incomp

 Please check out /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_violence_score_20250704_084836_239146 for more details.


2025-07-04 08:50:07 +0000   67620 execution.bulk     INFO     Finished 8 / 12 lines.
2025-07-04 08:50:07 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 2.6 seconds. Estimated time for incomplete lines: 10.4 seconds.
2025-07-04 08:50:07 +0000   67620 execution.bulk     INFO     Finished 10 / 12 lines.
2025-07-04 08:50:07 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 2.08 seconds. Estimated time for incomplete lines: 4.16 seconds.
2025-07-04 08:50:07 +0000   67620 execution.bulk     INFO     Finished 11 / 12 lines.
2025-07-04 08:50:07 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.89 seconds. Estimated time for incomplete lines: 1.89 seconds.
2025-07-04 08:50:08 +0000   67620 execution.bulk     INFO     Finished 3 / 12 lines.
2025-07-04 08:50:08 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 7.19 seconds. Estimated time for incompl

 Please check out /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_selfharm_score_20250704_084836_239192 for more details.


2025-07-04 08:50:08 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:50:08 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.85 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-07-04 08:50:08 +0000   67620 execution          ERROR    12/12 flow run failed, indexes: [2,0,1,3,8,7,5,6,4,10,9,11], exception of index 2: (UserError) Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, and make sure you have the necessary access permissions. Status code: 404.
Visit https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot to troubleshoot this issue.


 Please check out /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_hateunfairness_score_20250704_084836_239170 for more details.


2025-07-04 08:50:09 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:50:09 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.83 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-07-04 08:50:09 +0000   67620 execution          ERROR    12/12 flow run failed, indexes: [3,6,7,1,5,8,2,0,4,9,10,11], exception of index 3: (UserError) Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, and make sure you have the necessary access permissions. Status code: 404.
Visit https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot to troubleshoot this issue.


 Please check out /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_sexual_score_20250704_084836_239213 for more details.


## Assign yourself the proper role to track results in Azure AI Foundry

1. Get your user ID

az ad signed-in-user show --query id --output tsv

2. Assign yourself the Storage Blob Data Contributor role. Replace the placeholder text with your subscription ID, resource group, and user ID.

az role assignment create --role "Storage Blob Data Contributor" --scope /subscriptions/mySubscriptionID/resourceGroups/myResourceGroupName --assignee-principal-type User --assignee-object-id "user-id"

## Run Evaluation and Track in Azure AI Foundry

In [8]:
from azure.ai.evaluation import evaluate
from azure.ai.evaluation import GroundednessEvaluator, RetrievalEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator, SimilarityEvaluator, F1ScoreEvaluator
from azure.ai.evaluation import RougeScoreEvaluator, RougeType
from azure.ai.evaluation import BleuScoreEvaluator
from azure.ai.evaluation import MeteorScoreEvaluator
from azure.ai.evaluation import GleuScoreEvaluator
from azure.ai.evaluation import ViolenceEvaluator, HateUnfairnessEvaluator, SelfHarmEvaluator, SexualEvaluator
from azure.identity import DefaultAzureCredential
import pandas as pd

# Evaluators constructed from the Azure OpenAI model configuration.
groundedness_eval = GroundednessEvaluator(model_config)
retrieval_eval = RetrievalEvaluator(model_config)
relevance_eval = RelevanceEvaluator(model_config)
coherence_eval = CoherenceEvaluator(model_config)
fluency_eval = FluencyEvaluator(model_config)
similarity_eval = SimilarityEvaluator(model_config)

# Evaluators constructed without a model configuration.
f1_eval = F1ScoreEvaluator()
rouge_eval = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1)
bleu_eval = BleuScoreEvaluator()
meteor_eval = MeteorScoreEvaluator(
    alpha=0.9,
    beta=3.0,
    gamma=0.5
)
gleu_eval = GleuScoreEvaluator()

# Risk and safety evaluators: share a single credential instead of creating a
# separate DefaultAzureCredential for each one.
credential = DefaultAzureCredential()
violence_eval = ViolenceEvaluator(azure_ai_project=azure_ai_project, credential=credential)
hateunfairness_eval = HateUnfairnessEvaluator(azure_ai_project=azure_ai_project, credential=credential)
selfharm_eval = SelfHarmEvaluator(azure_ai_project=azure_ai_project, credential=credential)
sexual_eval = SexualEvaluator(azure_ai_project=azure_ai_project, credential=credential)

path = "Data/output/nasabooks-evalset.jsonl"

# Same evaluation as the local run, but azure_ai_project is also passed to
# evaluate() so the run is tracked in Azure AI Foundry.
result = evaluate(
    data=path, # provide your data here
    evaluators={
        "groundedness": groundedness_eval,
        "retrieval": retrieval_eval,
        "relevance": relevance_eval,
        "coherence": coherence_eval,
        "fluency": fluency_eval,
        "similarity": similarity_eval,
        "f1_score": f1_eval,
        "rouge_score": rouge_eval,
        "bleu_score": bleu_eval,
        "meteor_score": meteor_eval,
        "gleu_score": gleu_eval,
        "violence_score": violence_eval,
        "hateunfairness_score": hateunfairness_eval,
        "selfharm_score": selfharm_eval,
        "sexual_score": sexual_eval
    },
    # column mapping
    evaluator_config={
        "default": {
            "query": "${data.query}",
            "response": "${data.response}",
            "context": "${data.context}",
            "ground_truth": "${data.ground_truth}"
        }
    },
    azure_ai_project=azure_ai_project
)


[2025-07-04 08:54:36 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_retrieval_20250704_085436_726597, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_retrieval_20250704_085436_726597/logs.txt
[2025-07-04 08:54:36 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_coherence_20250704_085436_727476, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_coherence_20250704_085436_727476/logs.txt
[2025-07-04 08:54:36 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_groundedness_20250704_085436_726224, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_groundedness_20250704_085436_726224/logs.txt
[2025-07-04 08:54:36 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_fluency_20250704_085436_727801, l

Traceback (most recent call last):
  File "/workspaces/azure-ai-rag/.venv/lib/python3.12/site-packages/azure/ai/evaluation/_legacy/prompty/_prompty.py", line 382, in _send_with_retries
    response = await client.chat.completions.create(**params)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/azure-ai-rag/.venv/lib/python3.12/site-packages/promptflow/tracing/_integrations/_openai_injector.py", line 88, in wrapper
    return await f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/azure-ai-rag/.venv/lib/python3.12/site-packages/promptflow/tracing/_trace.py", line 476, in wrapped
    return await func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/azure-ai-rag/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2028, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/azure-ai-rag/.venv/lib/python3.12/site-packages/openai/_base

[2025-07-04 08:55:41 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_f1_score_20250704_085436_729120, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_f1_score_20250704_085436_729120/logs.txt


2025-07-04 08:55:42 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:55:42 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 5.46 seconds. Estimated time for incomplete lines: 0.0 seconds.


[2025-07-04 08:55:42 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_rouge_score_20250704_085436_729166, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_rouge_score_20250704_085436_729166/logs.txt
[2025-07-04 08:55:43 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_bleu_score_20250704_085436_729193, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_bleu_score_20250704_085436_729193/logs.txt


2025-07-04 08:55:43 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:55:43 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 0.01 seconds. Estimated time for incomplete lines: 0.0 seconds.


[2025-07-04 08:55:43 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_meteor_score_20250704_085436_729216, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_meteor_score_20250704_085436_729216/logs.txt
[2025-07-04 08:55:44 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_gleu_score_20250704_085436_729238, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_gleu_score_20250704_085436_729238/logs.txt
[2025-07-04 08:55:44 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_violence_score_20250704_085436_729259, log path: /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_violence_score_20250704_085436_729259/logs.txt
[2025-07-04 08:55:44 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_hateunfairness_score_

2025-07-04 08:56:04 +0000   67620 execution.bulk     INFO     Finished 8 / 12 lines.
2025-07-04 08:56:04 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 2.52 seconds. Estimated time for incomplete lines: 10.08 seconds.
2025-07-04 08:56:04 +0000   67620 execution.bulk     INFO     Finished 9 / 12 lines.
2025-07-04 08:56:04 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 2.24 seconds. Estimated time for incomplete lines: 6.72 seconds.
2025-07-04 08:56:04 +0000   67620 execution.bulk     INFO     Finished 11 / 12 lines.
2025-07-04 08:56:04 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.84 seconds. Estimated time for incomplete lines: 1.84 seconds.
2025-07-04 08:56:05 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:56:05 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.7 seconds. Estimated time for incomp

 Please check out /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_violence_score_20250704_085436_729259 for more details.


2025-07-04 08:56:06 +0000   67620 execution.bulk     INFO     Finished 9 / 12 lines.
2025-07-04 08:56:06 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 2.26 seconds. Estimated time for incomplete lines: 6.78 seconds.
2025-07-04 08:56:06 +0000   67620 execution.bulk     INFO     Finished 11 / 12 lines.
2025-07-04 08:56:06 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.85 seconds. Estimated time for incomplete lines: 1.85 seconds.
2025-07-04 08:56:06 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:56:06 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.78 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-07-04 08:56:06 +0000   67620 execution          ERROR    12/12 flow run failed, indexes: [2,0,1,4,7,5,6,3,8,9,10,11], exception of index 2: (UserError) Failed to connect to your Azure AI project. Please check if the project sc

 Please check out /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_hateunfairness_score_20250704_085436_729279 for more details.


2025-07-04 08:56:06 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:56:06 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.76 seconds. Estimated time for incomplete lines: 0.0 seconds.
2025-07-04 08:56:06 +0000   67620 execution          ERROR    12/12 flow run failed, indexes: [0,1,3,6,2,7,4,5,8,10,9,11], exception of index 0: (UserError) Failed to connect to your Azure AI project. Please check if the project scope is configured correctly, and make sure you have the necessary access permissions. Status code: 404.
Visit https://aka.ms/azsdk/python/evaluation/safetyevaluator/troubleshoot to troubleshoot this issue.
2025-07-04 08:56:07 +0000   67620 execution.bulk     INFO     Finished 3 / 12 lines.
2025-07-04 08:56:07 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 6.39 seconds. Estimated time for incomplete lines: 57.51 seconds.


 Please check out /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_selfharm_score_20250704_085436_729299 for more details.


2025-07-04 08:56:07 +0000   67620 execution.bulk     INFO     Finished 9 / 12 lines.
2025-07-04 08:56:07 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 2.14 seconds. Estimated time for incomplete lines: 6.42 seconds.
2025-07-04 08:56:07 +0000   67620 execution.bulk     INFO     Finished 10 / 12 lines.
2025-07-04 08:56:07 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.93 seconds. Estimated time for incomplete lines: 3.86 seconds.
2025-07-04 08:56:07 +0000   67620 execution.bulk     INFO     Finished 11 / 12 lines.
2025-07-04 08:56:07 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.75 seconds. Estimated time for incomplete lines: 1.75 seconds.
2025-07-04 08:56:07 +0000   67620 execution.bulk     INFO     Finished 12 / 12 lines.
2025-07-04 08:56:07 +0000   67620 execution.bulk     INFO     Average execution time for completed lines: 1.64 seconds. Estimated time for incom

 Please check out /home/codespace/.promptflow/.runs/azure_ai_evaluation_evaluators_sexual_score_20250704_085436_729320 for more details.


EvaluationException: (InternalError) The get 'azure-agentic-openai' workspace request failed with HTTP 404 - (ResourceNotFound) The Resource 'Microsoft.MachineLearningServices/workspaces/azure-agentic-openai' under resource group 'azure-agentic-ai' was not found. For more details please go to https://aka.ms/ARMResourceNotFoundFix

## View Evaluation Results

In [None]:
# Link to the tracked evaluation run, taken from the result returned by evaluate().
studio_url = result['studio_url']
print(studio_url)

## Evaluate Using a Custom Evaluator

In [None]:
from promptflow.client import load_flow

# Load the evaluator defined in friendliness.prompty, configured with the same
# model_config that the built-in evaluators use.
friendliness_model = {"configuration": model_config}
friendliness_eval = load_flow(source="friendliness.prompty", model=friendliness_model)

# Score the single sample query/response pair and show the result.
friendliness_score = friendliness_eval(query=query, response=response)
print(friendliness_score)