# Prepare Configurations

In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [8]:
import os
import json
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv 
from azure.ai.projects import AIProjectClient
from azure.ai.ml import MLClient


load_dotenv() 
credential = DefaultAzureCredential()

# Initialize Azure AI project and Azure OpenAI conncetion with your environment variables
azure_ai_project = {
    "subscription_id": os.environ.get("AZURE_SUBSCRIPTION_ID"),
    "resource_group_name": os.environ.get("AZURE_RESOURCE_GROUP"),
    "project_name": os.environ.get("AZURE_PROJECT_NAME"),
}

model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    "api_version": os.environ.get("AZURE_OPENAI_API_VERSION"),
}

project_client = AIProjectClient.from_connection_string(
    credential=DefaultAzureCredential(),
    conn_str=os.environ.get("AZURE_PROJECT_CONNECTION_STRING"),
)

# Define ml_client to register custom evaluator
ml_client = MLClient(
       subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
       resource_group_name=os.environ["AZURE_RESOURCE_GROUP"],
       workspace_name=os.environ["AZURE_PROJECT_NAME"],
       credential=DefaultAzureCredential()
)


## Performance and quality evaluator
- GroundednessEvaluator uses LLM 
- GroundednessProEvaluator uses Azure Content Safety in Azure Foundary

In [3]:
from azure.ai.evaluation import GroundednessProEvaluator, GroundednessEvaluator

# Initialzing Groundedness and Groundedness Pro evaluators
groundedness_eval = GroundednessEvaluator(model_config)
groundedness_pro_eval = GroundednessProEvaluator(azure_ai_project=azure_ai_project, credential=credential)

query_response = dict(
    query="Which tent is the most waterproof?",
    context="The Alpine Explorer Tent is the most water-proof of all tents available.",
    response="The Alpine Explorer Tent is the most waterproof."
)

## Running Groundedness Evaluator on a query and response pair
groundedness_score = groundedness_eval(
    **query_response
)
print(json.dumps(groundedness_score, indent=2))

groundedness_pro_score = groundedness_pro_eval(
    **query_response
)
print(json.dumps(groundedness_pro_score, indent=2))


Class GroundednessProEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{
  "groundedness": 5.0,
  "gpt_groundedness": 5.0,
  "groundedness_reason": "The response is fully grounded in the context, providing a complete and accurate answer to the query."
}
{
  "groundedness_pro_label": true,
  "groundedness_pro_reason": "All Contents are grounded"
}


## Risk and safety evaluator

In [4]:
from azure.ai.evaluation import ViolenceEvaluator
from azure.identity import DefaultAzureCredential
credential = DefaultAzureCredential()

# Initializing Violence Evaluator with project information
violence_eval = ViolenceEvaluator(credential=credential, azure_ai_project=azure_ai_project)
# Running Violence Evaluator on a query and response pair
violence_score = violence_eval(query="What is the capital of France?", response="Paris.")
print(json.dumps(violence_score, indent=2))

# Conversation mode

conversation_str =  """{"messages": [ { "content": "Which tent is the most waterproof?", "role": "user" }, { "content": "The Alpine Explorer Tent is the most waterproof", "role": "assistant", "context": "From the our product list the alpine explorer tent is the most waterproof. The Adventure Dining Table has higher weight." }, { "content": "How much does it cost?", "role": "user" }, { "content": "$120.", "role": "assistant", "context": "The Alpine Explorer Tent is $120."} ] }""" 
conversation = json.loads(conversation_str)
violence_conv_score = violence_eval(conversation=conversation) 
print(json.dumps(violence_conv_score, indent=2))


Class ViolenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{
  "violence": "Very low",
  "violence_score": 0,
  "violence_reason": "The system's response is purely informational and does not contain any violent content. It simply provides the capital of France, which is Paris. There is no mention of violence, harm, or any related topics."
}
{
  "violence_score": 0.0,
  "evaluation_per_turn": {
    "violence": [
      "Very low",
      "Very low"
    ],
    "violence_score": [
      0,
      0
    ],
    "violence_reason": [
      "The system's response is a straightforward answer to a question about waterproof tents. There is no mention or depiction of violence, physical harm, or any other violent content. Therefore, the severity level is 0.",
      "The system's response does not contain any violent content. It is simply providing a price in response to the human's question. There is no mention of violence, harm, or any related topics."
    ]
  }
}


## Custom evaluator

In [None]:
from answer_len.answer_length import AnswerLengthEvaluator

answer_length_evaluator = AnswerLengthEvaluator()
answer_length = answer_length_evaluator(answer="What is the speed of light?")

print(json.dumps(answer_length, indent=2))

{
  "answer_length": 27
}


## Prompt-based evaluator


In [6]:
from friendliness.friend import FriendlinessEvaluator

friendliness_eval = FriendlinessEvaluator(model_config)

friendliness_score = friendliness_eval(response="I will not apologize for my behavior!")
print(json.dumps(friendliness_score, indent=2))

{
  "score": 1,
  "reason": "The response is defensive and lacks warmth or friendliness."
}


## Local evaluation

### Prerequisites
If you want to enable logging to your Azure AI project for evaluation results, follow these steps:

- Make sure you're first logged in by running az login.
- Make sure you have the Identity-based access setting for the storage account in your Azure AI hub. To find your storage, go to the Overview page of your Azure AI hub and select Storage.
- Make sure you have Storage Blob Data Contributor role for the storage account.

In [None]:
!az login
!az account show

In [7]:
from azure.ai.evaluation import evaluate

result = evaluate(
    data="data/data.jsonl", # provide your data here
    evaluators={
        "groundedness": groundedness_eval,
        "answer_length": answer_length_evaluator,
    },
    # column mapping
    evaluator_config={
        "groundedness": {
            "column_mapping": {
                "query": "${data.query}",
                "context": "${data.context}",
                "response": "${data.response}"
            } 
        },
        "answer_length":{
            "column_mapping": {
                "answer": "${data.response}"
            }
        }
    },
    # Optionally provide your Azure AI project information to track your evaluation results in your Azure AI project
    #azure_ai_project = azure_ai_project,
    # Optionally provide an output path to dump a json of metric summary, row level data and metric and Azure AI project URL
    output_path="./myevalresults.json"
)


[2025-01-21 15:29:16 +0900][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run answer_len_answer_length_answerlengthevaluator_hc0s3jbu_20250121_152915_307222, log path: C:\Users\kenakamu\.promptflow\.runs\answer_len_answer_length_answerlengthevaluator_hc0s3jbu_20250121_152915_307222\logs.txt
[2025-01-21 15:29:16 +0900][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_frvu7fh1_20250121_152915_307222, log path: C:\Users\kenakamu\.promptflow\.runs\azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_frvu7fh1_20250121_152915_307222\logs.txt


Prompt flow service has started...
Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=answer_len_answer_length_answerlengthevaluator_hc0s3jbu_20250121_152915_307222
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=azure_ai_evaluation_evaluators_common_base_eval_asyncevaluatorbase_frvu7fh1_20250121_152915_307222
2025-01-21 15:29:16 +0900   11980 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2025-01-21 15:29:30 +0900   11980 execution.bulk     INFO     Finished 1 / 2 lines.
2025-01-21 15:29:30 +0900   11980 execution.bulk     INFO     Average execution time for completed lines: 13.66 seconds. Estimated time for incomplete lines: 13.66 seconds.
2025-01-21 15:29:30 +0900   11980 execution.bulk     INFO     Finished 2 / 2 lines.
2025-01-21 15:29:30 +0900   11980 execution.bulk     INFO     Average execution time for completed li

In [9]:
from azure.ai.ml.entities import Model
from promptflow.client import PFClient
from answer_len.answer_length import AnswerLengthEvaluator


# Then we convert it to evaluation flow and save it locally
pf_client = PFClient()
local_path = "answer_len_local"
pf_client.flows.save(entry=AnswerLengthEvaluator, path=local_path)

# Specify evaluator name to appear in the Evaluator library
evaluator_name = "AnswerLenEvaluator"

# Finally register the evaluator to the Evaluator library
custom_evaluator = Model(
    path=local_path,
    name=evaluator_name,
    description="Evaluator calculating answer length.",
)

registered_evaluator = ml_client.evaluators.create_or_update(custom_evaluator)
print("Registered evaluator id:", registered_evaluator.id)

# Registered evaluators have versioning. You can always reference any version available.
versioned_evaluator = ml_client.evaluators.get(evaluator_name, version=1)
print("Versioned evaluator id:", registered_evaluator.id)

Method evaluators: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Registered evaluator id: /subscriptions/348342f5-c2c7-4e33-87ef-5af51241bea0/resourceGroups/ai/providers/Microsoft.MachineLearningServices/workspaces/kenakamuevalproject/models/AnswerLenEvaluator/versions/2
Versioned evaluator id: /subscriptions/348342f5-c2c7-4e33-87ef-5af51241bea0/resourceGroups/ai/providers/Microsoft.MachineLearningServices/workspaces/kenakamuevalproject/models/AnswerLenEvaluator/versions/2


### Upload test data

In [11]:

data_id, _ = project_client.upload_file(file_path="./data/data.jsonl")

### Run the test in the cloud

- Update the registered id which you can confirm from Foundry 

In [13]:
from azure.ai.projects.models import Evaluation, Dataset, EvaluatorConfiguration, ConnectionType
from azure.ai.evaluation import F1ScoreEvaluator, RelevanceEvaluator, ViolenceEvaluator

# Construct dataset ID per the instruction
data_id = data_id

default_connection = project_client.connections.get_default(connection_type=ConnectionType.AZURE_OPEN_AI)

# Use the same model_config for your evaluator (or use different ones if needed)
model_config = default_connection.to_evaluator_model_config(
    deployment_name= os.environ.get("AZURE_OPENAI_DEPLOYMENT"), 
    api_version= os.environ.get("AZURE_OPENAI_API_VERSION"))

# Create an evaluation
evaluation = Evaluation(
    display_name="Cloud evaluation",
    description="Evaluation of dataset",
    data=Dataset(id=data_id),
    evaluators={
        # Note the evaluator configuration key must follow a naming convention
        # the string must start with a letter with only alphanumeric characters 
        # and underscores. Take "f1_score" as example: "f1score" or "f1_evaluator" 
        # will also be acceptable, but "f1-score-eval" or "1score" will result in errors.
        "f1_score": EvaluatorConfiguration(
            id=F1ScoreEvaluator.id,
            data_mapping={
                "query": "${data.query}",
                "response": "${data.response}",
                "ground_truth": "${data.ground_truth}"
            }
        ),
        "relevance": EvaluatorConfiguration(
            id=RelevanceEvaluator.id,
            init_params={
                "model_config": model_config
            },
            data_mapping={
                "query": "${data.query}",
                "response": "${data.response}"
            }
        ),
        "violence": EvaluatorConfiguration(
            id=ViolenceEvaluator.id,
            init_params={
                "azure_ai_project": project_client.scope
            },
            data_mapping={
                "query": "${data.query}",
                "response": "${data.response}"
            }
        ),
        "answer_len": EvaluatorConfiguration(
            id="azureml://locations/eastus2/workspaces/d7a5a4fb-c4ab-4c30-9e69-ed3012477767/models/AnswerLenEvaluator/versions/2",
            data_mapping={
                "answer": "${data.response}"
            }
        )
    },
)

# Create evaluation
evaluation_response = project_client.evaluations.create(
    evaluation=evaluation,
)

# Get evaluation
get_evaluation_response = project_client.evaluations.get(evaluation_response.id)

print("----------------------------------------------------------------")
print("Created evaluation, evaluation ID: ", get_evaluation_response.id)
print("Evaluation status: ", get_evaluation_response.status)
print("AI project URI: ", get_evaluation_response.properties["AiStudioEvaluationUri"])
print("----------------------------------------------------------------")

----------------------------------------------------------------
Created evaluation, evaluation ID:  ccaafeb5-5a2f-4223-b4dc-4767e037642a
Evaluation status:  Starting
AI project URI:  https://ai.azure.com/build/evaluation/ccaafeb5-5a2f-4223-b4dc-4767e037642a?wsid=/subscriptions/348342f5-c2c7-4e33-87ef-5af51241bea0/resourceGroups/ai/providers/Microsoft.MachineLearningServices/workspaces/kenakamuevalproject
----------------------------------------------------------------
