# Azure OpenAI Audio Evaluation Demo

A clean demonstration of the Azure OpenAI audio evaluation workflow.

## Prerequisites
- Azure OpenAI service with audio model deployment
- API key configured in `.env` file

In [None]:
from dotenv import load_dotenv
from scripts.eval_utils import AsyncEvalClient

# Load settings from the local .env file into the process environment
# (the Prerequisites section expects the API key to be configured there).
load_dotenv()

# Create the evaluation client used by every later cell.
# NOTE(review): presumably AsyncEvalClient reads its Azure OpenAI settings
# from the environment populated above — confirm in scripts/eval_utils.
client = AsyncEvalClient()
print("🎉 Azure OpenAI Evaluation Client ready!")

# Deployment name of the audio model; later cells use it both as the grader
# model and as the model being sampled.
AUDIO_MODEL_DEPLOYMENT_NAME = "gpt-4o-audio-preview"

## Load and Create Audio Dataset

Load audio samples from HuggingFace and create an evaluation dataset file in JSONL format. The function will:
- Download the audio dataset from Hugging Face
- Convert audio to base64-encoded WAV format
- Prepare the file for evaluation
- Display the first 3 rows from the dataset

In [None]:
from scripts.audio_utils import load_and_create_audio_dataset, display_items

# Load the "AbstractTTS/CREMA-D" dataset from Hugging Face and create the
# audio dataset file used for evaluation.
# NOTE(review): the output is presumably the JSONL file uploaded in the next
# cell (./data/audio_emotion_evaluation.jsonl) — confirm in scripts/audio_utils.
load_and_create_audio_dataset("AbstractTTS/CREMA-D")

# Display the first 3 items of the created evaluation file as a sanity check.
display_items(3)

In [None]:
# Upload the evaluation file to Azure OpenAI
eval_file_id = await client.upload_file(
    file_name="audio_emotion_evaluation.jsonl",
    file_path="./data/audio_emotion_evaluation.jsonl")
print(f"âœ… Eval file ID: {eval_file_id}")

## Upload Evaluation Dataset

Upload the prepared audio evaluation file to your Azure OpenAI account. This file contains base64-encoded audio samples with expected emotions that will be used as the data source for running evaluations.

## Initialize Azure OpenAI Client



In [None]:
score_model = {
      "type": "score_model",
      "name": "Tone/Emotion Grader",
      "model": AUDIO_MODEL_DEPLOYMENT_NAME,
      "input": [
        {
          "role": "system",
          "content": "You are a helpful assistant that evaluates audio clips to judge whether they match a provided {{item.expected_emotion}}. The audio clip is the model''s prediction of emotion. Result must be a float in [0.0, 1.0] similarity to {{item.expected_emotion}}, where 1.0 means the speakerâ€™s tone exactly matches the expected emotion and 0.0 means it does not match at all. Do not return other text."
        },
        {
          "role": "user",
          "content": [
            {
              "type": "input_audio",
              "input_audio": {
                "data": "{{ sample.output_audio.data }}",
                "format": "wav"
              }
            }
          ]
        }
      ],
      "range": [
        0,
        1
      ],
      "pass_threshold": 0.5
    }

eval_id = await client.create_eval_sdk(
    name="Audio Emotion Evaluation",
    testing_criteria=[score_model],
    data_source_config={
    "type": "custom",
    "item_schema": {
      "type": "object",
      "properties": {
        "audio_data": {
          "type": "string",
          "description": "Base64-encoded WAV audio data."
        },
        "expected_emotion": {
          "type": "string",
          "description": "The expected primary emotion in the audio."
        }
      },
      "required": [
        "audio_data",
        "expected_emotion"
      ]
    },
    "include_sample_schema": True,
  })

## Create Evaluation with Audio Emotion Grader

Define a score model grader that evaluates the emotional tone of the model's audio responses. The grader:
- Listens to the model's audio output
- Compares it against the expected emotion from the dataset
- Returns a similarity score from 0.0 (no match) to 1.0 (exact match)
- Sets a passing threshold of 0.5 for acceptable emotion matching

## Run the Evaluation

Configure the data source and input messages for the evaluation run. This defines:
- The audio model to use for emotion identification
- System prompt instructing the model to analyze audio emotions
- User instructions to identify emotions from the audio input
- How to format multi-modal messages with text and audio
- Temperature and modality settings (text and audio output)

In [None]:
data_source = {
    "type": "completions",
    "model": AUDIO_MODEL_DEPLOYMENT_NAME,
    "sampling_params": {
      "temperature": 0.8
    },
    "modalities": [
      "text",
      "audio"
    ],
    "source": {
      "type": "file_id",
      "id": eval_file_id
    },
    "input_messages": {
      "type": "template",
      "template": [
        {
          "role": "system",
          "content": "You are a assistant that tells the emotion of audio input. You will be given an audio input."
        },
        {
          "role": "user",
          "type": "message",
          "content": {
            "type": "input_text",
            "text": "Listen to the audio and identify the primary emotion. Respond with exactly one word from: anger, fear, disgust, happy, sad."
          }
        },
        {
          "role": "user",
          "type": "message",
          "content": {
            "type": "input_audio",
            "input_audio": {
              "data": "{{item.audio_data}}",
              "format": "wav"
            }
          }
        }
      ]
    }
}

run = await client.create_eval_run_sdk(eval_id, "Audio Emotion Evaluation", data_source)
run_id = run['id']


## Monitor and Display Results

Poll the evaluation run status until completion, then retrieve and display the results in a DataFrame showing:
- Item IDs
- Grading results (pass/fail status based on emotion matching)
- Expected emotions from the dataset
- Model's identified emotions from the audio transcripts

In [None]:
import asyncio
import pandas as pd

while True:
    run = await client.get_eval_run_sdk(eval_id=eval_id, run_id=run_id)
    if run['status'] == "completed":
        output_items_response = await client.get_eval_run_output_items_sdk(
            eval_id=eval_id, run_id=run_id)

        # Get the actual list of items from the response object
        output_items = output_items_response.data if hasattr(output_items_response, 'data') else output_items_response

        # Create DataFrame with safe access to nested fields
        df_data = {
            "id": [],
            "grading_results": [],
            "expected_emotion": [],
            "audio_output": []
        }

        for item in output_items:
            # Convert Pydantic model to dict if needed
            item_dict = item.model_dump() if hasattr(item, 'model_dump') else item
            
            df_data["id"].append(item_dict.get("id", "N/A"))
            df_data["grading_results"].append(item_dict.get("status", "N/A"))
            
            # Safely get expected emotion
            datasource_item = item_dict.get('datasource_item', {})
            df_data["expected_emotion"].append(datasource_item.get("expected_emotion", "N/A"))
            
            # Check if audio output exists
            sample = item_dict.get("sample", {})
            output = sample.get("output", {})
            output_transcript = output[0].get("content")
            df_data["audio_output"].append(output_transcript)

        df = pd.DataFrame(df_data)
        display(df)
        break
    if run['status'] == "failed":
        print("Evaluation run failed:")
        print(run.get('error', 'Unknown error'))
        break
    print(f"Status: {run['status']}. Waiting...")
    await asyncio.sleep(5)
