# Azure OpenAI Audio Evaluation Demo

A clean demonstration of the Azure OpenAI audio evaluation workflow.

## Prerequisites
- An Azure OpenAI service with an audio model deployment
- An API key configured in a `.env` file

In [1]:
from dotenv import load_dotenv
from scripts.eval_utils import AsyncEvalClient

# Load settings from the local `.env` file into the process environment.
# NOTE(review): presumably AsyncEvalClient reads its API key/endpoint from these
# variables — confirm against scripts/eval_utils.
load_dotenv()

# Client reused by the later cells to upload the dataset, create the evaluation,
# start the run and poll its results.
client = AsyncEvalClient()
print("🎉 Azure OpenAI Evaluation Client ready!")

# Passed as the "model" for both the sampled responses and the grader below.
# NOTE(review): presumably this has to match the deployment name in your own
# Azure OpenAI resource — confirm before running.
AUDIO_MODEL_DEPLOYMENT_NAME = "gpt-4o-audio-preview"

🎉 Azure OpenAI Evaluation Client ready!


## Load and Create Audio Dataset

Load audio samples from Hugging Face and create an evaluation dataset file in JSONL format. The function will:
- Download the audio dataset from Hugging Face.
- Convert the audio to base64-encoded WAV format.
- Prepare the file for evaluation.
- Display the first 3 rows from the dataset.

In [2]:
from scripts.audio_utils import load_and_create_audio_dataset, display_items

# Pull the audio samples from Hugging Face and write the evaluation dataset file.
hf_dataset_id = "AbstractTTS/CREMA-D"
load_and_create_audio_dataset(hf_dataset_id)

# Preview the first few records of the evaluation file that was just created.
preview_count = 3
display_items(preview_count)

  from .autonotebook import tqdm as notebook_tqdm


✅ Created evaluation file with 100 items
{
  "item": {
    "audio_data": "UklGRpYcAQBXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YXIcAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

In [3]:
# Upload the evaluation file to Azure OpenAI
eval_file_id = await client.upload_file(
    file_name="audio_emotion_evaluation.jsonl",
    file_path="./data/audio_emotion_evaluation.jsonl")
print(f"âœ… Eval file ID: {eval_file_id}")

File uploaded successfully to Azure.
✅ Eval file ID: file-d1f77b56b37f46a8adb9e001c8256862


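## Set up the score model grader

Define a `score_model` grader that listens to the model's audio response and returns a score between 0 and 1 for how closely its tone matches the expected emotion (scores of at least 0.5 pass), then create the evaluation that uses it.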

In [4]:
score_model = {
      "type": "score_model",
      "name": "Tone/Emotion Grader",
      "model": AUDIO_MODEL_DEPLOYMENT_NAME,
      "input": [
        {
          "role": "system",
          "content": "You are an audio tone analyzer. Listen to the audio and provide an accurate primary emotion. Return a float score in [0,1] where 1 means the speaker tone/emotion is same as {{item.expected_emotion}}."
        },
        {
          "role": "user",
          "content": [
            {
              "type": "input_audio",
              "input_audio": {
                "data": "{{ sample.output_audio.data }}",
                "format": "wav"
              }
            }
          ]
        }
      ],
      "range": [
        0,
        1
      ],
      "pass_threshold": 0.5
    }

eval_id = await client.create_eval_sdk(
    name="Audio Emotion Evaluation",
    testing_criteria=[score_model],
    data_source_config={
    "type": "custom",
    "item_schema": {
      "type": "object",
      "properties": {
        "audio_data": {
          "type": "string",
          "description": "Base64-encoded WAV audio data."
        },
        "expected_emotion": {
          "type": "string",
          "description": "The expected primary emotion in the audio."
        }
      },
      "required": [
        "audio_data",
        "expected_emotion"
      ]
    },
    "include_sample_schema": True,
  })

Evaluation created successfully with ID: eval_69004977bb3c819183fb281e7603744e


In [5]:
data_source = {
    "type": "completions",
    "model": AUDIO_MODEL_DEPLOYMENT_NAME,
    "sampling_params": {
      "temperature": 0.8
    },
    "modalities": [
      "text",
      "audio"
    ],
    "source": {
      "type": "file_id",
      "id": eval_file_id
    },
    "input_messages": {
      "type": "template",
      "template": [
        {
          "role": "system",
          "content": "You are an assistant that can analyze audio input for emotion and tone. You will be given an audio input to analyze."
        },
        {
          "role": "user",
          "type": "message",
          "content": {
            "type": "input_text",
            "text": "Listen to the following audio and identify the primary emotion/tone. Respond with audio that matches the same emotion. Keep your response under 10 seconds."
          }
        },
        {
          "role": "user",
          "type": "message",
          "content": {
            "type": "input_audio",
            "input_audio": {
              "data": "{{item.audio_data}}",
              "format": "wav"
            }
          }
        }
      ]
    }
}

run = await client.create_eval_run_sdk(eval_id, "Audio Emotion Evaluation", data_source)
run_id = run['id']


Created evaluation run for Audio Emotion Evaluation: evalrun_690049798d84819184ff99b04009e1e2


In [None]:
import asyncio
import pandas as pd

while True:
    run = await client.get_eval_run_sdk(eval_id=eval_id, run_id=run_id)
    if run['status'] == "completed":
        output_items_response = await client.get_eval_run_output_items_sdk(
            eval_id=eval_id, run_id=run_id)

        # Get the actual list of items from the response object
        output_items = output_items_response.data if hasattr(output_items_response, 'data') else output_items_response

        # Create DataFrame with safe access to nested fields
        df_data = {
            "id": [],
            "grading_results": [],
            "expected_emotion": [],
            "audio_output": []
        }

        for item in output_items:
            # Convert Pydantic model to dict if needed
            item_dict = item.model_dump() if hasattr(item, 'model_dump') else item
            
            df_data["id"].append(item_dict.get("id", "N/A"))
            df_data["grading_results"].append(item_dict.get("status", "N/A"))
            
            # Safely get expected emotion
            datasource_item = item_dict.get('datasource_item', {})
            df_data["expected_emotion"].append(datasource_item.get("expected_emotion", "N/A"))
            
            # Check if audio output exists
            sample = item_dict.get("sample", {})
            output = sample.get("output", {})
            output_transcript = output[0].get("content")
            df_data["audio_output"].append(output_transcript)

        df = pd.DataFrame(df_data)
        display(df)
        break
    if run['status'] == "failed":
        print("Evaluation run failed:")
        print(run.get('error', 'Unknown error'))
        break
    print(f"Status: {run['status']}. Waiting...")
    await asyncio.sleep(5)


Sample output item:
OutputItemListResponse(id='outputitem_6900499b99d8819184b844666ed147aa', created_at=1761626523, datasource_item={'audio_data': '[large blob omitted]', 'expected_emotion': 'fear'}, datasource_item_id=96, eval_id='eval_69004977bb3c819183fb281e7603744e', object='eval.run.output_item', results=[Result(name='Tone/Emotion Grader-7d4eb506-5a93-4b8a-9134-375854cdc320', passed=False, score=0.1, sample={'input': [{'role': 'developer', 'content': 'Respond ONLY with a single JSON object matching: {"steps":[{"description":"string","conclusion":"string"}],"result":number}. Do not include any extra text. result must be a float in [0.0, 1.0].'}, {'role': 'system', 'content': 'You are an audio tone analyzer. Listen to the audio and provide an accurate primary emotion. Return a float score in [0,1] where 1 means the speaker tone/emotion is same as fear.'}, {'role': 'user', 'content': '<audio format=wav>[base64 audio omitted]</audio>'}], 'output': [{'role': 'assistant', 'content': '{\

Unnamed: 0,id,grading_results,expected_emotion,audio_output
0,outputitem_6900499b99d8819184b844666ed147aa,fail,fear,"<audio transcript=The tone seems neutral, with..."
1,outputitem_6900499b09a081919507441f005efae5,fail,fear,<audio transcript=The tone of the audio is neu...
2,outputitem_6900499af8bc8191b1162353f1e0e50d,fail,fear,<audio transcript=The tone of the audio is neu...
3,outputitem_6900499aeb748191982d69fd3080f5af,fail,happy,<audio transcript=The tone of the audio is neu...
4,outputitem_6900499a5d848191a4bc6660bc71c3ae,fail,disgust,<audio transcript=The tone of the audio is urg...
5,outputitem_6900499a4a848191a141b63fff92ced1,fail,happy,<audio transcript=The tone of the audio is neu...
6,outputitem_6900499a344c8191a3cb1d86ddcb5028,fail,anger,<audio transcript=The tone of the audio is neu...
7,outputitem_6900499a29748191841a9f1f32bd3596,fail,happy,<audio transcript=The tone sounds neutral.\n\n...
8,outputitem_690049998dbc8191a27efc9cae8d3284,fail,anger,<audio transcript=The tone of the audio is neu...
9,outputitem_690049997dc08191820b7742707a0e3e,fail,disgust,<audio transcript=The tone of the audio seems ...
