### Load environment variables

In [1]:
from dotenv import load_dotenv
import os
load_dotenv('../.env')

True

### Define AI Studio connection

In [5]:
# define the Azure AI Studio connection
azure_ai_project = {
    "subscription_id": os.environ.get("SUBSCRIPTION_ID"),
    "resource_group_name": os.environ.get("RESOURCE_GROUP_NAME"),
    "project_name": os.environ.get("PROJECT_NAME"),
    #"credential": token_provider,
}

### Load the dataset

In [2]:
import json
def load_jsonl(path):
    with open(path, "r") as f:
        return [json.loads(line) for line in f.readlines()]
    
mydata = load_jsonl('../data/evaluation_dataset.jsonl')

In [3]:
# create directory for output
output_dir = '../data/evaluate/aoai/'
os.makedirs(output_dir, exist_ok=True)

### Evaluate Azure OpenAI using RAG (index created in setup)

In [12]:
#from azure.ai.generative.evaluate import evaluate
from promptflow.evals.evaluate import evaluate

result = evaluate( 
    evaluation_name="azure-openai-qa-eval", #name your evaluation to view in AI Studio
    data=mydata, # data to be evaluated
    task_type="qa", # for different task types, different metrics are available
    metrics_list=["gpt_groundedness", "gpt_relevance", "gpt_coherence", "gpt_fluency", "gpt_similarity"], #optional superset over default set of metrics
    model_config= { #for AI-assisted metrics, need to hook up AOAI GPT model for doing the measurement
            "api_version": "2023-05-15",
            "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
            "api_type": "azure",
            "api_key": os.getenv("AZURE_OPENAI_KEY"),
            "deployment_id": os.getenv("AZURE_OPENAI_EVALUATION_DEPLOYMENT")
    },
    data_mapping={
        "questions":"question", #column of data providing input to model
        "contexts":"context", #column of data providing context for each input
        "answer":"answer", #column of data providing output from model
        "ground_truth":"groundtruth" #column of data providing ground truth answer, optional for default metrics
        },
    # Optionally provide your AI Studio project information to track your evaluation results in your Azure AI studio project
    azure_ai_project = azure_ai_project,
    # Optionally provide an output path to dump a json of metric summary, row level data and metric and studio URL
    output_path=output_dir
)


ValueError: data must be a string.

In [11]:
from promptflow.evals.evaluate import evaluate

result = evaluate(
    evaluation_name="rai-workshop-test", #name your evaluation to view in AI Studio
    data=mydata, # provide your data here - must be string
    target=llm_tool,
    evaluators={
        #"relevance": relevance_eval,
        "coherence": coherence_eval,
        #"groundedness": groundedness_eval,
        "fluency": fluency_eval,
        "similarity": similarity_eval,
        "f1_score": f1score_eval

    },
    # column mapping
    evaluator_config={
        "default": {
            "question": "${data.question)", #column of data providing input to model
            #"contexts": "${data.context}", #column of data providing context for each input
            "answer": "${target.answer}", #column of data providing output from model
            "ground_truth":"${data.truth}" #column of data providing ground truth answer, optional for default metrics
        }
    },
    # Optionally provide your AI Studio project information to track your evaluation results in your Azure AI studio project
    azure_ai_project = azure_ai_project,
    # Optionally provide an output path to dump a json of metric summary, row level data and metric and studio URL
    output_path=output_dir
)

NameError: name 'llm_tool' is not defined

In [13]:
# Show average results
print(result._metrics_summary)

{'gpt_coherence': 5.0, 'gpt_similarity': nan, 'gpt_fluency': 5.0, 'gpt_relevance': 5.0, 'gpt_groundedness': 5.0}


In [12]:
print(result.studio_url)

https://ai.azure.com/build/evaluation/13d44513-6a55-4f4b-ae9e-926284ed93d3?wsid=/subscriptions/3c8972d9-f541-46b2-b70b-d81baba3595d/resourceGroups/aistudio-rg/providers/Microsoft.MachineLearningServices/workspaces/aistudio-ai-aiproj


### Repeat, but this time with another model

In [None]:
from azure.ai.generative.evaluate import evaluate

result = evaluate( 
    evaluation_name="azure-phi2-qa-eval", #name your evaluation to view in AI Studio
    data=mydata, # data to be evaluated
    task_type="qa", # for different task types, different metrics are available
    metrics_list=["gpt_groundedness", "gpt_relevance", "gpt_coherence", "gpt_fluency", "gpt_similarity"], #optional superset over default set of metrics
    model_config= { #for AI-assisted metrics, need to hook up AOAI GPT model for doing the measurement
            "api_version": "2023-05-15",
            "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
            "api_type": "azure",
            "api_key": os.getenv("AZURE_OPENAI_KEY"),
            "deployment_id": os.getenv("AZURE_OPENAI_EVALUATION_DEPLOYMENT")
    },
    data_mapping={
        "questions":"question", #column of data providing input to model
        "contexts":"context", #column of data providing context for each input
        "answer":"answer", #column of data providing output from model
        "ground_truth":"groundtruth" #column of data providing ground truth answer, optional for default metrics
        },
        output_path=output_dir, #optional: save output artifacts to local folder path 
    tracking_uri=client.tracking_uri, #optional: if configured with AI client, evaluation gets logged to AI Studio   
)
