# GPT Realtime Audio (Azure OpenAI)

This notebook tests the Realtime API (text in, audio out) using Azure OpenAI.

1. Update `.env` with your Azure OpenAI endpoint (`AZURE_OPENAI_ENDPOINT`) and deployment name (`AZURE_OPENAI_DEPLOYMENT_NAME`).
2. Run `az login` in a terminal for Entra ID auth.
3. Run the cells below.


In [None]:
%pip install openai[realtime] azure-identity python-dotenv


In [None]:
import os
import base64
import asyncio
from dotenv import load_dotenv
from openai import AsyncOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# Load the settings from `.env` so the os.environ lookups in main() can
# find the endpoint and deployment name.
load_dotenv()

async def _print_response_events(connection) -> None:
    """Print server events from *connection* until the response completes.

    Iterates the connection's event stream and reports each event type the
    notebook cares about: text deltas are echoed inline, audio deltas are
    base64-decoded and only their byte count is printed, transcript deltas
    and error details are printed on their own lines. Returns as soon as a
    ``response.done`` event is seen; all other event types are ignored.

    Args:
        connection: The open realtime connection to read events from.
    """
    async for event in connection:
        if event.type == "response.output_text.delta":
            print(event.delta, flush=True, end="")
        elif event.type == "session.created":
            print(f"Session ID: {event.session.id}")
        elif event.type == "response.output_audio.delta":
            audio_data = base64.b64decode(event.delta)
            print(f"\nReceived {len(audio_data)} bytes of audio data.")
        elif event.type == "response.output_audio_transcript.delta":
            print(f"Received text delta: {event.delta}")
        elif event.type == "response.output_text.done":
            print()
        elif event.type == "error":
            # NOTE(review): errors are reported but do not end the loop; if
            # the server never follows up with `response.done` this waits
            # indefinitely — confirm whether that is acceptable here.
            print("Received an error event.")
            print(f"Error code: {event.error.code}")
            print(f"Error Event ID: {event.error.event_id}")
            print(f"Error message: {event.error.message}")
        elif event.type == "response.done":
            return


async def main() -> None:
    """Run an interactive text-in / audio-out Realtime API conversation.

    Reads ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_DEPLOYMENT_NAME`` from
    the environment, obtains a bearer token through
    ``DefaultAzureCredential``, opens a realtime connection to the
    deployment, and configures the session. It then repeatedly prompts for a
    message, sends it, and prints the streamed response events. Entering
    ``q`` ends the conversation.

    The credential is always closed on exit, including when an exception
    interrupts the session.

    Raises:
        KeyError: If either required environment variable is not set. This
            is checked before any credential or network work is started.
    """
    # Validate configuration first so a missing variable fails immediately
    # instead of after a token has already been requested.
    endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
    deployment_name = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]

    # The realtime connection is a WebSocket, so swap the URL scheme.
    base_url = endpoint.replace("https://", "wss://").rstrip("/") + "/openai/v1"

    credential = DefaultAzureCredential()
    try:
        token_provider = get_bearer_token_provider(
            credential, "https://cognitiveservices.azure.com/.default"
        )
        # The token is fetched once and passed as the API key.
        # NOTE(review): it is not refreshed afterwards — confirm that a
        # session is not expected to outlive the token.
        token = token_provider()

        client = AsyncOpenAI(
            websocket_base_url=base_url,
            api_key=token,
        )

        async with client.realtime.connect(model=deployment_name) as connection:
            await connection.session.update(
                session={
                    "type": "realtime",
                    "instructions": "You are a helpful assistant. You respond by voice and text.",
                    "output_modalities": ["audio"],
                    "audio": {
                        "input": {
                            "transcription": {"model": "whisper-1"},
                            "format": {"type": "audio/pcm", "rate": 24000},
                            "turn_detection": {
                                "type": "server_vad",
                                "threshold": 0.5,
                                "prefix_padding_ms": 300,
                                "silence_duration_ms": 200,
                                "create_response": True,
                            },
                        },
                        "output": {
                            "voice": "alloy",
                            "format": {"type": "audio/pcm", "rate": 24000},
                        },
                    },
                }
            )

            while True:
                # NOTE: input() is a blocking call, so nothing else runs on
                # the event loop while waiting for the user to type.
                user_input = input("Enter a message (q to quit): ")
                if user_input.strip().lower() == "q":
                    print("Stopping the conversation.")
                    break

                await connection.conversation.item.create(
                    item={
                        "type": "message",
                        "role": "user",
                        "content": [{"type": "input_text", "text": user_input}],
                    }
                )
                await connection.response.create()

                await _print_response_events(connection)
    finally:
        # Release the credential even if the session fails or is interrupted.
        credential.close()

# Top-level `await` relies on the notebook kernel already running an event
# loop; a plain Python script would need asyncio.run(main()) instead.
await main()
