# Install Required Libraries
Install the necessary Python libraries.

In [None]:
%pip install promptflow-evals
%pip install --upgrade promptflow

# Built-in evaluators

Built-in evaluators support the following application scenarios:
- Question and answer
- Chat

> Note: It is recommended to use GPT models that do not have the (preview) suffix for the best performance and parseable responses with the evaluators.

| Category | Evaluator class |
|----------|-----------------|
| Performance and quality | `GroundednessEvaluator`, `RelevanceEvaluator`, `CoherenceEvaluator`, `FluencyEvaluator`, `SimilarityEvaluator`, `F1ScoreEvaluator` |
| Risk and safety | `ViolenceEvaluator`, `SexualEvaluator`, `SelfHarmEvaluator`, `HateUnfairnessEvaluator` |
| Composite | `QAEvaluator`, `ChatEvaluator`, `ContentSafetyEvaluator`, `ContentSafetyChatEvaluator` |

See the required inputs for built-in evaluators [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/flow-evaluate-sdk#required-data-input-for-built-in-evaluators).

### Performance and Quality Evaluators
This is an example of a relevence evaluator. 

In [19]:
import os
import json
from dotenv import load_dotenv
from promptflow.core import AzureOpenAIModelConfiguration

load_dotenv("credentials.env")
print(os.getenv("AZURE_OPENAI_ENDPOINT"))

# Initialize Azure OpenAI Connection
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)
from promptflow.evals.evaluators import RelevanceEvaluator

# Initialzing Relevance Evaluator
relevance_eval = RelevanceEvaluator(model_config)
# Running Relevance Evaluator on single input row
relevance_score = relevance_eval(
    answer="The Alpine Explorer Tent is the most waterproof.",
    context="From the our product list,"
    " the alpine explorer tent is the most waterproof."
    " The Adventure Dining Table has higher weight.",
    question="Which tent is the most waterproof?",
)
print(json.dumps(relevance_score, indent=4))

https://gpt4-0sweden.openai.azure.com/
{
    "gpt_relevance": 5.0
}


### Risk and safety evaluators

> Note: to use this you must be logged into Azure in your current IDE, eg VsCode.

In [None]:
# %pip install azure-identity

#You may need to log into your tenant through the Azure CLI to authenticate

In [20]:
from azure.core.credentials import TokenCredential
from azure.identity import DefaultAzureCredential

In [21]:
#replace with your own project scope
project_scope = {
   "subscription_id": "13e90e3a-50db-43f8-86e5-42bca5b14ebd",
    "resource_group_name": "rg-aistudio-35hub",
    "project_name": "sk-project"
}


from promptflow.evals.evaluators import ViolenceEvaluator
from azure.identity import DefaultAzureCredential

# Initialize the credential variable
credential = DefaultAzureCredential()

# Initializing Violence Evaluator with project information
violence_eval = ViolenceEvaluator(project_scope)

# Running Violence Evaluator on single input row
violence_score = violence_eval(question="What is the capital of France?", answer="Paris.")
print(json.dumps(violence_score, indent=4))

[2024-08-06 13:53:54 +0100][flowinvoker][INFO] - Validating flow input with data {'metric_name': 'violence', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': '13e90e3a-50db-43f8-86e5-42bca5b14ebd', 'resource_group_name': 'rg-aistudio-35hub', 'project_name': 'sk-project'}, 'credential': None}
[2024-08-06 13:53:54 +0100][flowinvoker][INFO] - Execute flow with data {'metric_name': 'violence', 'question': 'What is the capital of France?', 'answer': 'Paris.', 'project_scope': {'subscription_id': '13e90e3a-50db-43f8-86e5-42bca5b14ebd', 'resource_group_name': 'rg-aistudio-35hub', 'project_name': 'sk-project'}, 'credential': None}


2024-08-06 13:53:54 +0100   32504 execution.flow     INFO     Start executing nodes in thread pool mode.
2024-08-06 13:53:54 +0100   32504 execution.flow     INFO     Start to run 2 nodes with concurrency level 16.
2024-08-06 13:53:54 +0100   32504 execution.flow     INFO     Executing node validate_inputs. node run id: 2f7c3cfe-3562-4c01-a455-b76fb67fc062_validate_inputs_898e9d4a-c638-4043-a111-864d0b865a02
2024-08-06 13:53:54 +0100   32504 execution.flow     INFO     Node validate_inputs completes.
2024-08-06 13:53:54 +0100   32504 execution.flow     INFO     The node 'evaluate_with_rai_service' will be executed because the activate condition is met, i.e. '${validate_inputs.output}' is equal to 'True'.
2024-08-06 13:53:54 +0100   32504 execution.flow     INFO     Executing node evaluate_with_rai_service. node run id: 2f7c3cfe-3562-4c01-a455-b76fb67fc062_evaluate_with_rai_service_0100095b-6727-49dc-bad5-03a8c855e7dc
2024-08-06 13:54:04 +0100   32504 execution.flow     INFO     Node ev

### Composite evaluators
Composite evaluators are built in evaluators that combine the individual quality or safety metrics to easily provide a wide range of metrics right out of the box.

In [24]:
from promptflow.evals.evaluators import ChatEvaluator

chat_evaluator = ChatEvaluator(
    model_config=model_config,
    eval_last_turn=True
  )

conversation = [
    {"role": "user", "content": "What is the value of 2 + 2?"},
    {"role": "assistant", "content": "2 + 2 = 4", "context": {
        "citations": [
                {"id": "math_doc.md", "content": "Information about additions: 1 + 2 = 3, 2 + 2 = 4"}
                ]
        }
    }
]
result = chat_evaluator(conversation=conversation)

print(json.dumps(result, indent=4))

  aggregated[metric] = np.nanmean(values)


{
    "evaluation_per_turn": {
        "gpt_relevance": {
            "score": [
                5.0
            ]
        },
        "gpt_groundedness": {
            "score": [
                5.0
            ]
        },
        "gpt_coherence": {
            "score": [
                5.0
            ]
        },
        "gpt_fluency": {
            "score": [
                NaN
            ]
        },
        "gpt_retrieval": {
            "score": [
                5.0
            ]
        }
    },
    "gpt_coherence": 5.0,
    "gpt_fluency": NaN,
    "gpt_groundedness": 5.0,
    "gpt_relevance": 5.0,
    "gpt_retrieval": 5.0
}


# Custom evaluators
Custom evaluators allow you to build your own code-based or prompt-based evaluator.

### Code-based evaluators
This is when code-based evaluators let you define metrics based on functions or a callable class. Given a simple Python class in an example `answer_length.py` that calculates the length of an answer:

In [None]:
with open("answer_length.py", mode='r') as fin:
    print(fin.read())
from answer_length import AnswerLengthEvaluator

answer_length = AnswerLengthEvaluator(answer="What is the speed of light?")

print(answer_length)

### Prompt-based evaluators
You can create your own prompty-based evaluator and run it on a row of data:

In [23]:
with open("apology.prompty") as fin:
    print(fin.read())
from promptflow.client import load_flow

# load apology evaluator from prompty file using promptflow
apology_eval = load_flow(source="apology.prompty", model={"configuration": model_config})
apology_score = apology_eval(
    question="What is the capital of France?", answer="Paris"
)
print(apology_score)

---
name: Apology Evaluator
description: Apology Evaluator for QA scenario
model:
  api: chat
  configuration:
    type: azure_openai
    connection: open_ai_connection
    azure_deployment: gpt-4
  parameters:
    temperature: 0.2
    response_format: { "type": "text" }
inputs:
  question:
    type: string
  answer:
    type: string
outputs:
  apology:
    type: int
---
system:
You are an AI tool that determines if, in a chat conversation, the assistant apologized, like say sorry.
Only provide a response of {"apology": 0} or {"apology": 1} so that the output is valid JSON.
Give a apology of 1 if apologized in the chat conversation.
{"apology": 0}


# Evaluate on test dataset using `evaluate()`
Use `evaluate()` to combine multiple evaluators on an entire test dataset.

In [None]:
%pip install promptflow-azure

In [None]:
%pip install promptflow

In [None]:
from promptflow.evals.evaluate import evaluate

result = evaluate(
    data="./data/data.jsonl", # provide your data here
    evaluators={
        "relevance": relevance_eval
        },
    # column mapping
    evaluator_config={
        "default": {
            "ground_truth": "${data.truth}"
        }
    }
)

[mslearn link](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/flow-evaluate-sdk)