### Load environment variables

In [1]:
from dotenv import load_dotenv
import os
# Load the variables defined in ../.env into the process environment; the
# later cells read their Azure settings from it with os.getenv().
load_dotenv('../.env')

True

### Create AI client to connect to Studio

In [2]:
from azure.identity import DefaultAzureCredential
from azure.ai.resources.client import AIClient

# Build a single credential and use it on both code paths below, so the
# fallback honours the same options (shared token cache excluded).
credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True)

try:
    # Preferred path: let the SDK resolve the project via AIClient.from_config.
    client = AIClient.from_config(credential)
except Exception as ex:
    # Best-effort fallback: report why from_config failed, then connect
    # explicitly with the details of your AML workspace taken from the
    # environment (populated from ../.env).
    print(ex)
    client = AIClient(
        credential=credential,
        subscription_id=os.getenv("SUBSCRIPTION_ID"),
        resource_group_name=os.getenv("RESOURCE_GROUP_NAME"),
        project_name=os.getenv("PROJECT_NAME"),
    )

Class AIClient: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


### Load the dataset

In [3]:
import json
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the handle directly (no intermediate readlines() list) and
        # skip blank lines, e.g. an empty line at end of file, which
        # json.loads would reject.
        return [json.loads(line) for line in f if line.strip()]
    
# Parse the evaluation dataset into a list with one record per line of the
# file; `mydata` is what the evaluate() calls below receive as `data=`.
mydata = load_jsonl('../data/evaluation_dataset.jsonl')

In [4]:
# Local folder handed to evaluate() as `output_path` for its output artifacts.
output_dir = '../data/evaluate/aoai/'
# Create the folder together with any missing parents; exist_ok=True keeps a
# re-run of this cell from failing when the folder is already there.
os.makedirs(name=output_dir, exist_ok=True)

### Evaluate Azure OpenAI using RAG (index created in setup)

In [6]:
from azure.ai.generative.evaluate import evaluate

# Score the dataset with the AI-assisted QA metrics and keep the outcome in `result`.
# NOTE(review): the output saved with this notebook reports gpt_similarity as
# nan; confirm that "groundtruth" matches the ground-truth column name in the
# dataset and that the mapping key is the one the installed SDK expects.
result = evaluate(
    # Name under which the evaluation shows up in AI Studio.
    evaluation_name="azure-openai-qa-eval",
    # Records to be evaluated.
    data=mydata,
    # The task type determines which metrics are available.
    task_type="qa",
    # Optional: superset over the default set of metrics.
    metrics_list=[
        "gpt_groundedness",
        "gpt_relevance",
        "gpt_coherence",
        "gpt_fluency",
        "gpt_similarity",
    ],
    # AI-assisted metrics need an AOAI GPT deployment to do the measurement.
    model_config={
        "api_version": "2023-05-15",
        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_type": "azure",
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "deployment_id": os.getenv("AZURE_OPENAI_EVALUATION_DEPLOYMENT"),
    },
    data_mapping={
        # Column of data providing input to the model.
        "questions": "question",
        # Column of data providing context for each input.
        "contexts": "context",
        # Column of data providing output from the model.
        "answer": "answer",
        # Column of data providing the ground truth answer (optional for default metrics).
        "ground_truth": "groundtruth",
    },
    # Optional: save output artifacts to a local folder path.
    output_path=output_dir,
    # Optional: if configured with the AI client, the evaluation gets logged to AI Studio.
    tracking_uri=client.tracking_uri,
)


  outputs.fillna(value="(Failed)", inplace=True)  # replace nan with explicit prompt


In [13]:
# Show average results
# NOTE(review): `_metrics_summary` is underscore-prefixed (private by
# convention); switch to a public accessor if the installed SDK version
# exposes one. TODO confirm.
print(result._metrics_summary)

{'gpt_coherence': 5.0, 'gpt_similarity': nan, 'gpt_fluency': 5.0, 'gpt_relevance': 5.0, 'gpt_groundedness': 5.0}


In [12]:
# Print the AI Studio link for this evaluation run.
# The printed URL embeds the subscription ID, resource group and workspace
# name, so clear this cell's output before sharing the notebook.
print(result.studio_url)

https://ai.azure.com/build/evaluation/13d44513-6a55-4f4b-ae9e-926284ed93d3?wsid=/subscriptions/3c8972d9-f541-46b2-b70b-d81baba3595d/resourceGroups/aistudio-rg/providers/Microsoft.MachineLearningServices/workspaces/aistudio-ai-aiproj


### Repeat, but this time with another model

In [None]:
from azure.ai.generative.evaluate import evaluate

# Give this run its own artifact folder. Reusing `output_dir`
# ('../data/evaluate/aoai/') would put the phi2 run's files in the folder that
# already holds the Azure OpenAI run's artifacts.
phi2_output_dir = '../data/evaluate/phi2/'
os.makedirs(phi2_output_dir, exist_ok=True)

# NOTE(review): `data=mydata` is the same dataset scored in the previous
# evaluation and no other model is referenced in this call; presumably the
# "answer" column should hold the other model's responses. Confirm before
# comparing the two runs.
result = evaluate(
    # Name under which the evaluation shows up in AI Studio.
    evaluation_name="azure-phi2-qa-eval",
    # Records to be evaluated.
    data=mydata,
    # The task type determines which metrics are available.
    task_type="qa",
    # Optional: superset over the default set of metrics.
    metrics_list=[
        "gpt_groundedness",
        "gpt_relevance",
        "gpt_coherence",
        "gpt_fluency",
        "gpt_similarity",
    ],
    # AI-assisted metrics need an AOAI GPT deployment to do the measurement.
    model_config={
        "api_version": "2023-05-15",
        "api_base": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_type": "azure",
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "deployment_id": os.getenv("AZURE_OPENAI_EVALUATION_DEPLOYMENT"),
    },
    data_mapping={
        # Column of data providing input to the model.
        "questions": "question",
        # Column of data providing context for each input.
        "contexts": "context",
        # Column of data providing output from the model.
        "answer": "answer",
        # Column of data providing the ground truth answer (optional for default metrics).
        "ground_truth": "groundtruth",
    },
    # Optional: save output artifacts to a local folder path (separate from the first run).
    output_path=phi2_output_dir,
    # Optional: if configured with the AI client, the evaluation gets logged to AI Studio.
    tracking_uri=client.tracking_uri,
)
