In [None]:
# TODO
# revisit MSI auth against Azure OpenAI


## Evaluating with the Azure AI Evaluation SDK

https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/flow-evaluate-sdk

### An Azure OpenAI resource created

To evaluate with AI-assisted metrics, you need:

A test dataset in .jsonl format. See the next section for dataset requirements
A deployment of one of these models: GPT 3.5 models, GPT 4 models, or Davinci models AND an embedding model for grounded responses with RAG.
Ideally, GPT 4 models are recommended for the best evaluation capabilities.


## Add steps to create an Azure OpenAI resource and deploy a model

### Install SDK

In [None]:
# install into current notebook environent (https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/)
import sys
!{sys.executable} -m pip install azure-ai-evaluation

### Prepare config files

#### create .env file containing secrets
```
SUBSCRIPTION_ID=
RESOURCE_GROUP_NAME=
PROJECT_NAME=
AZURE_OPENAI_ENDPOINT=
AZURE_OPENAI_EVALUATION_DEPLOYMENT=
# Uncomment if using key-based auth
# AZURE_OPENAI_KEY=
```

In [1]:
from dotenv import load_dotenv
load_dotenv('../.env', override=True)

True

### Initialise Azure OpenAI connection

#### Make sure user is Azure OpenAI contributor
https://learn.microsoft.com/en-us/azure/ai-studio/concepts/rbac-ai-studio#scenario-use-an-existing-azure-openai-resource

In [2]:
import os

# If api_key is not included in the model_config, the prompty runtime in promptflow-core will pick up DefaultAzureCredential
# Initialize Azure OpenAI Connection with your environment variables
model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    "api_version": os.environ.get("AZURE_OPENAI_API_VERSION"),
}

### Or key-based auth

In [None]:
import os

# Initialize Azure OpenAI Connection with your environment variables
model_config = {
    "azure_endpoint": os.environ.get("AZURE_OPENAI_ENDPOINT"),
    "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
    "azure_deployment": os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    "api_version": os.environ.get("AZURE_OPENAI_API_VERSION"),
}

#### Test Performance & Quality evaluators

When using AI-assisted performance and quality metrics, you must specify a GPT model for the calculation process. Choose a deployment with either GPT-3.5, GPT-4, or the Davinci model for your calculations and set it as your model_config. Both Azure OpenAI or OpenAI model configuration schemas are supported.

In [3]:
from azure.ai.evaluation import GroundednessEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator, SimilarityEvaluator

# Initialising Relevance Evaluator
relevance_eval = RelevanceEvaluator(model_config)
# Running Relevance Evaluator on single input row
relevance_score = relevance_eval(
    response="The Alpine Explorer Tent is the most waterproof.",
    context="From the our product list,"
    " the alpine explorer tent is the most waterproof."
    " The Adventure Dining Table has higher weight.",
    query="Which tent is the most waterproof?",
)
print(relevance_score)

{'relevance': 4.0, 'gpt_relevance': 4.0, 'relevance_reason': 'The response directly answers the question but lacks additional details or insights that could enhance understanding.'}


As an alternative to individual Performance and Quality evaluators, you can use a composite function, QAEvaluator, to evaluate multiple metrics at once.  *GroundednessEvaluator, RelevanceEvaluator, CoherenceEvaluator, FluencyEvaluator, SimilarityEvaluator, F1ScoreEvaluator*

In [4]:
from azure.ai.evaluation import QAEvaluator

# Initialising composite QAEvaluator
qa_eval = QAEvaluator(model_config)

# Running QAEvaluator on single input row
qa_score = qa_eval(
    response="The TrailMaster X4 tent is the most waterproof.",
    context="TrailMaster X4 Tent, price $250,## BrandOutdoorLiving## CategoryTents## Features- Polyester material for durability- Spacious interior to accommodate multiple people- Easy setup with included instructions- Water-resistant construction to withstand light rain- Mesh panels for ventilation and insect protection- Rainfly included for added weather protection- Multiple doors for convenient entry and exit- Interior pockets for organizing small ite- Reflective guy lines for improved visibility at night- Freestanding design for easy setup and relocation- Carry bag included for convenient storage and transportatio## Technical Specs**Best Use**: Camping  **Capacity**: 4-person  **Season Rating**: 3-season  **Setup**: Freestanding  **Material**: Polyester  **Waterproof**: Yes  **Rainfly**: Included  **Rainfly Waterproof Rating**: 2000mm",
    query="Which tent is the most waterproof?",
    ground_truth="The TrailMaster X4 tent has a rainfly waterproof rating of 2000mm",
)
print(qa_score)


{'groundedness': 4.0, 'gpt_groundedness': 4.0, 'groundedness_reason': 'The response correctly identifies the TrailMaster X4 tent as waterproof but fails to address the comparative aspect of the query, as the context does not provide information about other tents for comparison.', 'relevance': 4.0, 'gpt_relevance': 4.0, 'relevance_reason': 'The response directly answers the question but lacks additional details or insights that could enhance understanding.', 'coherence': 4.0, 'gpt_coherence': 4.0, 'coherence_reason': 'The response is coherent and directly answers the question, but it is very brief and lacks additional information or context.', 'fluency': 3.0, 'gpt_fluency': 3.0, 'fluency_reason': 'The response is clear and grammatically correct but lacks complexity and variety in vocabulary and sentence structure.', 'similarity': 5.0, 'gpt_similarity': 5.0, 'f1_score': 0.5333333333333333}


#### Test Risk and Safety evaluators
GPT not required - instead we use Azure AI Studio safety evaluations back-end service.

Note - Risk and safety metrics are only available in the following regions: East US 2, France Central, UK South, Sweden Central. 

***Groundedness measurement leveraging Azure AI Content Safety Groundedness Detection is only supported following regions: East US 2 and Sweden Central.***

Check [region-availability](https://learn.microsoft.com/en-us/azure/ai-services/content-safety/overview#region-availability)

In [4]:
# define the Azure AI Studio connection
azure_ai_project = {
    "subscription_id": os.environ.get("SUBSCRIPTION_ID"),
    "resource_group_name": os.environ.get("RESOURCE_GROUP_NAME"),
    "project_name": os.environ.get("PROJECT_NAME")
}

In [5]:
from azure.identity import DefaultAzureCredential

In [6]:
from azure.ai.evaluation import ViolenceEvaluator, SexualEvaluator, SelfHarmEvaluator, HateUnfairnessEvaluator, IndirectAttackEvaluator, ProtectedMaterialEvaluator

# Initialzing Violence Evaluator with project information
violence_eval = ViolenceEvaluator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
# Running Violence Evaluator on single input row
violence_score = violence_eval(query="What is the capital of France?", response="Paris.")
print(violence_score)

Class ViolenceEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{'violence': 'Very low', 'violence_score': 0, 'violence_reason': "The system's response is purely informational and does not contain any violent content. It simply provides the capital of France, which is Paris. There is no mention of violence, harm, or any related topics."}


Similar to the Performance and Quality evaluators, you can use a composite evaluator, ContentSafetyEvaluator, to evaluate multiple safety metrics at once. *ViolenceEvaluator, SexualEvaluator, SelfHarmEvaluator, HateUnfairnessEvaluator*

In [7]:
from azure.ai.evaluation import ContentSafetyEvaluator

# Initialzing Content Safety Evaluator
csafety_eval = ContentSafetyEvaluator(azure_ai_project=azure_ai_project, credential=DefaultAzureCredential())
# Running Relevance Evaluator on single input row
csafety_score = csafety_eval(
    response="The Alpine Explorer Tent is the most waterproof.",
    context="From the our product list,"
    " the alpine explorer tent is the most waterproof."
    " The Adventure Dining Table has higher weight.",
    query="Which tent is the most waterproof?",
)
print(csafety_score)

Class ContentSafetyEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SexualEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SelfHarmEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class HateUnfairnessEvaluator: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{'violence': 'Very low', 'violence_score': 0, 'violence_reason': "The system's response is a straightforward answer to a question about waterproof tents. There is no mention of violence, harm, or any related content. The response is purely informational and does not contain any distressing or discomforting language.", 'sexual': 'Very low', 'sexual_score': 0, 'sexual_reason': "The system's response does not contain any sexual content. It simply provides information about a waterproof tent.", 'self_harm': 'Very low', 'self_harm_score': 0, 'self_harm_reason': "The system's response does not contain any self-harm-related content. It simply provides information about a waterproof tent, which is unrelated to self-harm.", 'hate_unfairness': 'Very low', 'hate_unfairness_score': 0, 'hate_unfairness_reason': "The system's response provides information about a tent without any mention of social groups or ungrounded inference of sensitive attributes. There is no negative sentiment or unfair langua

| Evaluator | query | response | context | ground_truth |
| --- | --- | --- | --- | --- | 
| GroundednessEvaluator | N/A | Required: String | Required: String | N/A |
| RelevanceEvaluator | Required: String | Required: String | Required: String | N/A |
| CoherenceEvaluator | Required: String | Required: String | N/A | N/A |
| FluencyEvaluator | Required: String | Required: String | N/A | N/A |
| SimilarityEvaluator | Required: String | Required: String | N/A | Required: String |
| RougeScoreEvaluator | N/A | Required: String | N/A | Required: String |
| GleuScoreEvaluator | N/A | Required: String | N/A | Required: String |
| BleuScoreEvaluator | N/A | Required: String | N/A | Required: String |
| MeteorScoreEvaluator | N/A | Required: String | N/A | Required: String |
| F1ScoreEvaluator | N/A | Required: String | N/A | Required: String |
| ViolenceEvaluator | Required: String | Required: String | N/A | N/A |
| SexualEvaluator | Required: String | Required: String | N/A | N/A |
| SelfHarmEvaluator | Required: String | Required: String | N/A | N/A |
| HateUnfairnessEvaluator | Required: String | Required: String | N/A | N/A |

### Examine local dataset

In [8]:
import json
def load_jsonl(path):
    with open(path, "r") as f:
        return [json.loads(line) for line in f.readlines()]
    
mydata = load_jsonl('../data/evaluation_dataset.jsonl')
mydata[:5]

[{'question': 'Which tent is the most waterproof?',
  'truth': 'The Alpine Explorer Tent has the highest rainfly waterproof rating at 3000m'},
 {'question': 'Which camping table holds the most weight?',
  'truth': 'The Adventure Dining Table has a higher weight capacity than all of the other camping tables mentioned'},
 {'question': 'How much does TrailWalker Hiking Shoes cost? ',
  'truth': '$110'},
 {'question': 'What is the proper care for trailwalker hiking shoes? ',
  'truth': 'After each use, remove any dirt or debris by brushing or wiping the shoes with a damp cloth.'},
 {'question': 'What brand is for TrailMaster tent? ',
  'truth': 'OutdoorLiving'}]

In [9]:
import os
# create directory for output
output_dir = '../data/evaluate'
os.makedirs(output_dir, exist_ok=True)

### Run a  qa evaluation against the AI studio to ensure the connection is working

In [9]:
# callable function that invokes Azure OpenAI.  For use as target in evaluator.
from genai.llm import llm_tool

In [21]:
from promptflow.evals.evaluators import CoherenceEvaluator, RelevanceEvaluator, GroundednessEvaluator, FluencyEvaluator, SimilarityEvaluator, F1ScoreEvaluator

coherence_eval = CoherenceEvaluator(model_config=model_config)
relevance_eval = RelevanceEvaluator(model_config=model_config)
groundedness_eval = GroundednessEvaluator(model_config=model_config)
fluency_eval = FluencyEvaluator(model_config=model_config)
similarity_eval = SimilarityEvaluator(model_config=model_config)
f1score_eval = F1ScoreEvaluator()


In [9]:
data_path = "/home/krbock/GitHub/rai-genai-workshop/data/evaluation_dataset.jsonl"
output_dir = "/home/krbock/GitHub/rai-genai-workshop/data/evaluate/"

In [34]:
from azure.ai.evaluation import evaluate

def response_length(response, **kwargs):
    return {"value": len(response)}

result = evaluate(
        data="/home/krbock/GitHub/rai-genai-workshop/data/evaluation_dataset.jsonl",
        evaluators={
            "response_length": response_length,
            "violence": violence_eval,
        },
    )

print(result)

EvaluationException: Missing required inputs for evaluator response_length : ['response'].

In [None]:
from promptflow.evals.evaluate import evaluate

result = evaluate(
    evaluation_name="rai-workshop-test", #name your evaluation to view in AI Studio
    data=data_path, # provide your data here - must be string
    target=llm_tool,
    evaluators={
        #"relevance": relevance_eval,
        "coherence": coherence_eval,
        #"groundedness": groundedness_eval,
        "fluency": fluency_eval,
        "similarity": similarity_eval,
        "f1_score": f1score_eval

    },
    # column mapping
    evaluator_config={
        "default": {
            "question": "${data.question)", #column of data providing input to model
            #"contexts": "${data.context}", #column of data providing context for each input
            "answer": "${target.answer}", #column of data providing output from model
            "ground_truth":"${data.truth}" #column of data providing ground truth answer, optional for default metrics
        }
    },
    # Optionally provide your AI Studio project information to track your evaluation results in your Azure AI studio project
    azure_ai_project = azure_ai_project,
    # Optionally provide an output path to dump a json of metric summary, row level data and metric and studio URL
    output_path=output_dir
)

### Create a Retrieval Augmented Generation (RAG) application using Promptflow SDK

We will use the RAG pattern to validate our model against ground-truth.

#### Data
This sample uses files from the folder data/ in this repo. You can clone this repo or copy this folder to make sure you have access to these files when running the sample.

In [None]:
# from earlier step
# import json
# def load_jsonl(path):
#     with open(path, "r") as f:
#         return [json.loads(line) for line in f.readlines()]

#mydata = load_jsonl('../data/evaluation_dataset.jsonl')

mydata[:2]

#### Create an local FAISS index from your local files
https://learn.microsoft.com/en-us/azure/ai-studio/how-to/develop/index-build-consume-sdk

Install the FAISS package with
```
"Please install it with `pip install faiss-gpu` (for CUDA supported GPU) "
    "or `pip install faiss-cpu` (depending on Python version)."
```

In [4]:
# connect to the AI Studio project
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

client=MLClient(
    DefaultAzureCredential(), 
    subscription_id=os.environ.get("SUBSCRIPTION_ID"),
    resource_group_name=os.environ.get("RESOURCE_GROUP_NAME"),
    workspace_name=os.environ.get("PROJECT_NAME") 
)

In [2]:
import os
# connect to the AI Studio project
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

ml_client=MLClient(
    DefaultAzureCredential(), 
    subscription_id=os.environ.get("SUBSCRIPTION_ID"),
    resource_group_name=os.environ.get("RESOURCE_GROUP_NAME"),
    workspace_name=os.environ.get("PROJECT_NAME") 
)

In [None]:
from azure.ai.ml.entities import AzureOpenAIConnection, ApiKeyConfiguration
from azure.ai.ml.entities import UsernamePasswordConfiguration

name = "testing-aad"

target = "https://msopenai.cognitiveservices.azure.com/"

resource_id= "/subscriptions/3c8972d9-f541-46b2-b70b-d81baba3595d/resourceGroups/openai-rg/providers/Microsoft.CognitiveServices/accounts/msopenai"

# Microsoft Entra ID
credentials = None
# Uncomment the following if you need to use API key instead
# api_key= "my-key"
# credentials = ApiKeyConfiguration(key=api_key)

wps_connection = AzureOpenAIConnection(
    name=name,
    azure_endpoint=target,
    credentials=credentials,
    resource_id = resource_id,
    is_shared=False
)
ml_client.connections.create_or_update(wps_connection)

#### Use AIStudio's Azure OpenAI connection

In [12]:
from promptflow.rag.config import ConnectionConfig

embedding_model_config = ConnectionConfig(
    subscription_id = os.environ.get("SUBSCRIPTION_ID"),
    resource_group_name = os.environ.get("RESOURCE_GROUP_NAME"),
    workspace_name = os.environ.get("PROJECT_NAME"),
    connection_name = os.environ.get("AISTUDIO_AOAI_CONNECTION_NAME"),
    #connection_name= "mssecureai4034688619"
)

In [None]:
from promptflow.rag.config import LocalSource, EmbeddingsModelConfig
from promptflow.rag import build_index

faiss_index_name = "product-info-faiss-index"
embedding_output_dir = "../data"

# build the index
faiss_index=build_index(
    name=faiss_index_name,  # name of your index
    vector_store="faiss",  # the type of vector store
    embeddings_model_config=EmbeddingsModelConfig(
        model_name=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
        deployment_name=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
        connection_config=embedding_model_config
    ),
    input_source=LocalSource(input_data="../data/product-info/"),  # the location of your file/folders
    #index_config=LocalSource(input_data="../data/product-info/"
        #ai_search_index_name="<your-index-name>" + "-aoai-store", # the name of the index store inside the azure ai search service
    #),
    tokens_per_chunk = 800, # Optional field - Maximum number of tokens per chunk
    token_overlap_across_chunks = 0, # Optional field - Number of tokens to overlap between chunks
    embeddings_cache_path=embedding_output_dir, # Optional field - Path to store embeddings cache
)

#### Consume index

In [None]:
from promptflow.rag import get_langchain_retriever_from_index

# Get the OpenAI embedded Index
#retriever=get_langchain_retriever_from_index(faiss_index)
retriever=get_langchain_retriever_from_index(faiss_index)
retriever.get_relevant_documents("Which tent is the most waterproof")


#### Register Index (Optional)

In [None]:
# connect to the AI Studio project
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

client=MLClient(
    DefaultAzureCredential(), 
    subscription_id=os.environ.get("SUBSCRIPTION_ID"),
    resource_group_name=os.environ.get("RESOURCE_GROUP_NAME"),
    workspace_name=os.environ.get("PROJECT_NAME") 
)

In [None]:
from azure.ai.ml.entities import Index

# register the index with Azure OpenAI embeddings
client.indexes.create_or_update(
    Index(name=faiss_index_name + "aoai", 
          path=faiss_index, 
          version="1")
          )

#### Option 2: Use Azure AI Search to create an index


In [10]:
from promptflow.rag.config import ConnectionConfig
embedding_model_config = ConnectionConfig(
    subscription_id = os.environ.get("SUBSCRIPTION_ID"),
    resource_group_name = os.environ.get("RESOURCE_GROUP_NAME"),
    workspace_name = os.environ.get("PROJECT_NAME"),
    connection_name = os.environ.get("AISTUDIO_AOAI_CONNECTION_NAME"),
    #connection_name = "mssecureai4034688619"
)

ais_model_config = ConnectionConfig(
    subscription_id = os.environ.get("SUBSCRIPTION_ID"),
    resource_group_name = os.environ.get("RESOURCE_GROUP_NAME"),
    workspace_name = os.environ.get("PROJECT_NAME"),
    connection_name = os.environ.get("AISTUDIO_AIS_CONNECTION_NAME"),
)

In [11]:
# connect to the AI Studio project
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

client=MLClient(
    DefaultAzureCredential(), 
    subscription_id=os.environ.get("SUBSCRIPTION_ID"),
    resource_group_name=os.environ.get("RESOURCE_GROUP_NAME"),
    workspace_name=os.environ.get("PROJECT_NAME")
    )

In [None]:
from promptflow.rag.config import AzureAISearchConfig, EmbeddingsModelConfig, LocalSource
from promptflow.rag import build_index

ais_index_name = "product-info-ais-index"
embedding_output_dir = "../data"

local_index_aoai=build_index(
    name=ais_index_name,  # name of your index
    vector_store="azure_ai_search",  # the type of vector store
    embeddings_model_config=EmbeddingsModelConfig(
        model_name=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL"),
        deployment_name=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT"),
        connection_config=embedding_model_config
    ),
    input_source=LocalSource(input_data="../data/product-info/"),  # the location of your file/folders
    index_config=AzureAISearchConfig(
        ai_search_index_name="product-info-ais-index" + "-aoai-store", # the name of the index store inside the azure ai search service
        ai_search_connection_config=ais_model_config
    ),
    tokens_per_chunk = 800, # Optional field - Maximum number of tokens per chunk
    token_overlap_across_chunks = 0, # Optional field - Number of tokens to overlap between chunks
    embeddings_cache_path=embedding_output_dir, # Optional field - Path to store embeddings cache
)

### Consume the local index

In [None]:
from promptflow.rag import get_langchain_retriever_from_index

# Get the OpenAI embedded Index
retriever=get_langchain_retriever_from_index(local_index_aoai)
retriever.get_relevant_documents("Which tent is the most waterproof")



In [None]:
from azure.ai.ml.entities import Index
# register the index so that it shows up in the project
cloud_index = client.indexes.create_or_update(Index(name=ais_index_name, path=local_index_aoai))

print(f"Created index '{cloud_index.name}'")
print(f"Cloud Path: {cloud_index.path}")

### Use a Flow to evaluate

| Evaluator | question | answer | context | ground_truth |
| --- | --- | --- | --- | --- | 
| GroundednessEvaluator | N/A | Required: String | Required: String | N/A |
| RelevanceEvaluator | Required: String | Required: String | Required: String | N/A |
| CoherenceEvaluator | Required: String | Required: String | N/A | N/A |
| FluencyEvaluator | Required: String | Required: String | N/A | N/A |
| SimilarityEvaluator | Required: String | Required: String | N/A | Required: String |
| F1ScoreEvaluator | N/A | Required: String | N/A | Required: String |
| ViolenceEvaluator | Required: String | Required: String | N/A | N/A |
| SexualEvaluator | Required: String | Required: String | N/A | N/A |
| SelfHarmEvaluator | Required: String | Required: String | N/A | N/A |
| HateUnfairnessEvaluator | Required: String | Required: String | N/A | N/A |

In [15]:
import os
from promptflow.entities import AzureOpenAIConnection, CognitiveSearchConnection

model_connect = AzureOpenAIConnection(
    name=os.environ.get("AISTUDIO_AOAI_CONNECTION_NAME"),
    api_base=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_type="azure",
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    #api_version=os.environ.get("AZURE_OPENAI_API_VERSION"),
)

ais_connect = CognitiveSearchConnection(
    name=os.environ.get("AISTUDIO_AIS_CONNECTION_NAME"),
    api_base=os.environ.get("AZURE_AI_SEARCH_ENDPOINT"),
    api_key=os.environ.get("AZURE_AI_SEARCH_KEY"),
)

In [16]:
myflow = "../3-metaprompt-grounding/prompt-flow/product-chat"

In [None]:
from promptflow.client import load_flow


flow_path = myflow
sample_input = '../data/evaluation_dataset.jsonl', # data to be evaluated

f = load_flow(source=flow_path)

f.context.connections = {"DetermineIntent": {"connection": model_connect}, 
                         "RetrieveDocuments": {"searchConnection": ais_connect, "embeddingModelConnection": model_connect}, 
                         "DetermineReply": {"connection": model_connect}}

result = f(url=sample_input)

print(result)

In [18]:
def copilot_wrapper(*, chat_input, **kwargs):
    from copilot_flow.copilot import get_chat_response

    result = get_chat_response(chat_input)

    parsedResult = {"answer": str(result["reply"]), "context": str(result["context"])}
    return parsedResult

In [None]:
from promptflow.evals.evaluate import evaluate

result = evaluate(
    target=copilot_wrapper,
    evaluation_name="qa-eval-with-flow", #name your evaluation to view in AI Studio
    data='../data/evaluation_dataset.jsonl', # data to be evaluated
    evaluators={
        "relevance": relevance_eval,
        "groundedness": groundedness_eval,
        "coherence": coherence_eval,
        "fluency": fluency_eval,
        "similarity": similarity_eval,
        "f1score": f1score_eval
    },
    evaluator_config={
        "relevance": {"question": "${data.question}"},
        "coherence": {"question": "${data.question}"},
        "groundedness": {"question": "${data.question}"},
        "fluency": {"question": "${data.answer}"},
        
                "default": {
            "questions": "${data.question)", #column of data providing input to model
            #"contexts": "${data.context}", #column of data providing context for each input
            "answer": "${target.answer}", #column of data providing output from model
            "ground_truth":"${data.truth}" #column of data providing ground truth answer, optional for default metrics
        }
    },
    # to log evaluation to the cloud AI Studio project
    azure_ai_project=azure_ai_project
)

In [None]:
from promptflow.evals.evaluate import evaluate

result = evaluate(
    target=copilot_wrapper,
    evaluation_name="qa-eval-with-flow", #name your evaluation to view in AI Studio
    data='../data/evaluation_dataset.jsonl', # data to be evaluated
    evaluators={
        "relevance": relevance_eval,
        "groundedness": groundedness_eval,
        "coherence": coherence_eval,
        "fluency": fluency_eval,
        "similarity": similarity_eval,
        "f1score": f1score_eval
    },
    evaluator_config={
        "relevance": {"question": "${data.question}"},
        "coherence": {"question": "${data.question}"},
        "groundedness": {"question": "${data.question}"},
        "fluency": {"question": "${data.answer}"},
        
                "default": {
            "questions": "${data.question)", #column of data providing input to model
            #"contexts": "${data.context}", #column of data providing context for each input
            "answer": "${target.answer}", #column of data providing output from model
            "ground_truth":"${data.truth}" #column of data providing ground truth answer, optional for default metrics
        }
    },
    # to log evaluation to the cloud AI Studio project
    azure_ai_project=azure_ai_project
)

In [None]:
from app_target import ModelEndpoints
import pathlib

from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import (
    ContentSafetyEvaluator,
    RelevanceEvaluator,
    CoherenceEvaluator,
    GroundednessEvaluator,
    FluencyEvaluator,
    SimilarityEvaluator,
)


content_safety_evaluator = ContentSafetyEvaluator(project_scope=azure_ai_project)
relevance_evaluator = RelevanceEvaluator(model_config=configuration)
coherence_evaluator = CoherenceEvaluator(model_config=configuration)
groundedness_evaluator = GroundednessEvaluator(model_config=configuration)
fluency_evaluator = FluencyEvaluator(model_config=configuration)
similarity_evaluator = SimilarityEvaluator(model_config=configuration)

models = [
    "gpt4-0613",
    "gpt35-turbo",
    "mistral7b",
    "phi3_mini_serverless",
    "tiny_llama",
    "gpt2",
]

path = str(pathlib.Path(pathlib.Path.cwd())) + "/data.jsonl"

for model in models:
    randomNum = random.randint(1111, 9999)
    results = evaluate(
        azure_ai_project=azure_ai_project,
        evaluation_name="Eval-Run-" + str(randomNum) + "-" + model.title(),
        data=path,
        target=ModelEndpoints(env_var, model),
        evaluators={
            "content_safety": content_safety_evaluator,
            "coherence": coherence_evaluator,
            "relevance": relevance_evaluator,
            "groundedness": groundedness_evaluator,
            "fluency": fluency_evaluator,
            "similarity": similarity_evaluator,
        },
        evaluator_config={
            "content_safety": {"question": "${data.question}", "answer": "${target.answer}"},
            "coherence": {"answer": "${target.answer}", "question": "${data.question}"},
            "relevance": {"answer": "${target.answer}", "context": "${data.context}", "question": "${data.question}"},
            "groundedness": {
                "answer": "${target.answer}",
                "context": "${data.context}",
                "question": "${data.question}",
            },
            "fluency": {"answer": "${target.answer}", "context": "${data.context}", "question": "${data.question}"},
            "similarity": {"answer": "${target.answer}", "context": "${data.context}", "question": "${data.question}"},
        },
    )


#### Use Model Inference API
https://learn.microsoft.com/en-us/azure/machine-learning/reference-model-inference-api

The Azure AI Model Inference API is available in the following models:

Models deployed to serverless API endpoints:

* [Cohere Embed V3](https://learn.micrsoft.com/en-us/azure/machine-learning/how-to-deploy-models-cohere-embed?view=azureml-api-2) family of models
* [Cohere Command R](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-cohere-command?view=azureml-api-2) family of models
* [Meta Llama 2](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-llama?view=azureml-api-2) chat family of models
* [Meta Llama 3 instruct](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-llama?view=azureml-api-2) family of models
* [Mistral-Small](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-mistral?view=azureml-api-2)
* [Mistral-Large](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-mistral?view=azureml-api-2)
* [Jais](https://learn.microsoft.com/en-us/azure/machine-learning/deploy-jais-models?view=azureml-api-2) family of models
* [Jamba](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-jamba?view=azureml-api-2) family of models
* [Phi-3](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-phi-3?view=azureml-api-2) family of models

Models deployed to [managed inference](https://learn.microsoft.com/en-us/azure/machine-learning/concept-endpoints-online?view=azureml-api-2):

* [Meta Llama 3 instruct](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-llama?view=azureml-api-2) family of models
* [Phi-3](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-deploy-models-phi-3?view=azureml-api-2) family of models
* Mixtral family of models

The API is compatible with Azure OpenAI model deployments.

In [None]:
# pip install azure-ai-inference

In [None]:
# key based auth

# import os
# from azure.ai.inference import ChatCompletionsClient
# from azure.core.credentials import AzureKeyCredential

# client = ChatCompletionsClient(
#     endpoint=os.environ["AZUREAI_ENDPOINT_URL"],
#     credential=AzureKeyCredential(os.environ["AZUREAI_ENDPOINT_KEY"]),
# )

# or EntraID
import os
from azure.ai.inference import ChatCompletionsClient
from azure.identity import AzureDefaultCredential

client = ChatCompletionsClient(
    endpoint=os.environ["AZUREAI_ENDPOINT_URL"],
    credential=AzureDefaultCredential(),
)

### Generate ground truth

In [None]:
#pip install azure-ai-generative[simulator]
from azure.ai.generative.synthetic.simulator import Simulator # Release into evals-synthetic

In [None]:
from promptflow.evals.synthetic import AdversarialSimulator

In [None]:
async def callback(
    messages: List[Dict],
    stream: bool = False,
    session_state: Any = None,
) -> dict:
    query = messages["messages"][0]["content"]
    context = None

    # Add file contents for summarization or re-write
    if 'file_content' in messages["template_parameters"]:
        query += messages["template_parameters"]['file_content']
    
    # Call your own endpoint and pass your query as input. Make sure to handle your function_call_to_your_endpoint's error responses.
    response = await function_call_to_your_endpoint(query) 
    
    # Format responses in OpenAI message protocol
    formatted_response = {
        "content": response,
        "role": "assistant",
        "context": {},
    }

    messages["messages"].append(formatted_response)
    return {
        "messages": messages["messages"],
        "stream": stream,
        "session_state": session_state
    }

In [None]:
from promptflow.evals.synthetic import AdversarialScenario

scenario = AdversarialScenario.ADVERSARIAL_QA
simulator = AdversarialSimulator(azure_ai_project=azure_ai_project)

outputs = await simulator(
        scenario=scenario, # required adversarial scenario to simulate
        target=callback, # callback function to simulate against
        max_conversation_turns=1, #optional, applicable only to conversation scenario
        max_simulation_results=3, #optional
        jailbreak=False #optional
    )

# By default simulator outputs json, use the following helper function to convert to QA pairs in jsonl format
print(outputs.to_eval_qa_json_lines())

In [None]:
from azure.identity import DefaultAzureCredential
from azure.ai.resources.client import AIClient
from azure.ai.resources.entities import AzureOpenAIModelConfiguration

# initialize ai_client. This assums that config.json downloaded from ai workspace is present in the working directory
ai_client = AIClient.from_config(DefaultAzureCredential())
# Retrieve default aoai connection if it exists
aoai_connection = ai_client.get_default_aoai_connection()
# alternatively, retrieve connection by name
# aoai_connection = ai_client.connections.get("<name of connection>")

# # Specify model and deployment name for your system large language model
# aoai_config = AzureOpenAIModelConfiguration.from_connection(
#     connection=aoai_connection,
#     model_name=os.getenv('AZURE_OPENAI_EVALUATION_MODEL'),
#     deployment_name=os.getenv('AZURE_OPENAI_EVALUATION_DEPLOYMENT'),
#     temperature=0.1,
#     max_tokens=300
# )
# # Specify model and deployment name for your system large language model
aoai_config = AzureOpenAIModelConfiguration.from_connection(
    connection=aoai_connection,
    model_name='gpt-4-32k',
    deployment_name='gpt-4-32k-0613',
    temperature=0.1,
    max_tokens=300
)

In [None]:
import os

In [None]:
from openai import AsyncAzureOpenAI
oai_client = AsyncAzureOpenAI(api_key=os.getenv('AZURE_OPENAI_KEY'), azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'), api_version="2024-02-15-preview")
async_oai_chat_completion_fn = oai_client.chat.completions.create

In [None]:
function_simulator = Simulator.from_fn(
    fn=async_oai_chat_completion_fn, # Simulate against a local function OR callback function
    simulator_connection=aoai_config # Configure the simulator
) 

In [None]:
template = Simulator.get_template("summarization")

In [None]:
template_params = [
    {
        "name": "John Doe",
        "chatbot_name": "AI Chatbot",
        "filename": "company_report.txt",
        "file_content": "The company is doing well. The stock price is up 10% this quarter. The company is expanding into new markets. The company is investing in new technology. The company is hiring new employees. The company is launching new products. The company is opening new stores. The company is increasing its market share. The company is increasing its revenue. The company is increasing its profits.",
    },
    {
        "name": "Jane Doe",
        "chatbot_name": "AI Chatbot",
        "filename": "sales_report.txt",
        "file_content": "The sales team is doing well. The sales team is meeting its targets. The sales team is increasing its revenue. The sales team is increasing its market share. The sales team is increasing its profits. The sales team is expanding into new markets. The sales team is launching new products. The sales team is opening new stores. The sales team is hiring new employees. The sales team is investing in new technology.",
    },
]

In [None]:
outputs = await function_simulator.simulate_async(
    template,
    parameters=template_params,
    max_conversation_turns=2,
    api_call_delay_sec=10,
    max_simulation_results=10,
)

### Generate QA from files

In [None]:
from pathlib import Path
# product sample data
texts_glob = Path("../data/product-info/")
# azureai-samples data
#texts_glob = Path("../../azureai-samples/scenarios/generate-synthetic-data/ai-generated-data-qna/data/data_generator_texts/")
files = Path.glob(texts_glob, pattern="**/*")
files = [file for file in files if Path.is_file(file)]

In [None]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import NLTKTextSplitter
import nltk

# download pre-trained Punkt tokenizer for sentence splitting
nltk.download("punkt")

text_splitter = NLTKTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",  # encoding for gpt-4 and gpt-35-turbo
    chunk_size=300,  # number of tokens to split on
    chunk_overlap=0,
)
texts = []
for file in files:
    loader = UnstructuredFileLoader(file)
    docs = loader.load()
    data = docs[0].page_content
    texts += text_splitter.split_text(data)
print(f"Number of texts after splitting: {len(texts)}")

In [None]:
from azure.ai.generative.synthetic.qa import QADataGenerator, QAType

## Uses AzureOpenAI environment variables

# For granular logs you may set DEBUG log level:
import logging
#logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.ERROR)

model_config = {
    "deployment": "gpt-4-1106-preview",
    "model": "gpt-4",
    "max_tokens": 2000,
}

qa_generator = QADataGenerator(model_config=model_config)

In [None]:
QADataGenerator(model_config=model_config)

#### Generate QA asynchronously

In [None]:
from azure.ai.generative.synthetic.qa import QADataGenerator, QAType
import asyncio
from collections import Counter
from typing import Dict

concurrency = 3  # number of concurrent calls
sem = asyncio.Semaphore(concurrency)

qa_type = QAType.CONVERSATION


async def generate_async(text: str) -> Dict:
    async with sem:
        return await qa_generator.generate_async(
            text=text,
            qa_type=qa_type,
            num_questions=3,  # Number of questions to generate per text
        )


results = await asyncio.gather(*[generate_async(text) for text in texts], return_exceptions=True)

question_answer_list = []
token_usage = Counter()
for result in results:
    if isinstance(result, Exception):
        raise result  # exception raised inside generate_async()
    question_answer_list.append(result["question_answers"])
    token_usage += result["token_usage"]

print("Successfully generated QAs")

In [None]:
print(f"Tokens used: {result['token_usage']}")

In [None]:
question_answer_list

### Save the generated data for later use
Let us save the generated QnA in a format which can be understood by prompt flow (for evaluation, batch runs). 

In [None]:
import os
generated_dir = "../data/generated_qa"
os.makedirs(generated_dir, exist_ok=True)
output_file = os.path.join(generated_dir, "generated_qa.jsonl")
qa_generator.export_to_file(output_file, qa_type, question_answer_list)

## Messing

In [None]:
# # try using connection config
# # from promptflow.rag.config import ConnectionConfig

# model_connect_config = ConnectionConfig(
#     subscription_id = os.environ.get("SUBSCRIPTION_ID"),
#     resource_group_name = os.environ.get("RESOURCE_GROUP_NAME"),
#     workspace_name = os.environ.get("PROJECT_NAME"),
#     connection_name = "mssecureai4034688619"

# model_connect = AzureOpenAIModelConfiguration.from_connection(model_connect_config)