From c5d9d280b923e91c382b5609290fa8e56af4787a Mon Sep 17 00:00:00 2001
From: jamesrochabrun <jamesrochabrun@gmail.com>
Date: Tue, 25 Nov 2025 22:30:16 -0800
Subject: [PATCH 1/6] Fix Realtime API tool_choice.name encoding for specific
 function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ToolChoice.specific case was incorrectly encoding the function name
nested inside a "function" object. OpenAI's Realtime API expects "name"
at the root level of tool_choice, not nested.

Before: {"type": "function", "function": {"name": "fn"}}
After:  {"type": "function", "name": "fn"}

This fixes the error: Missing required parameter: 'session.tool_choice.name'

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../Realtime/OpenAIRealtimeSessionConfiguration.swift    | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)
diff --git a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
index e853f8b..50308fd 100644
--- a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
+++ b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
@@ -73,19 +73,12 @@ public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
       case .specific(let functionName):
         var container = encoder.container(keyedBy: RootKey.self)
         try container.encode("function", forKey: .type)
-        var functionContainer = container.nestedContainer(
-          keyedBy: FunctionKey.self,
-          forKey: .function)
-        try functionContainer.encode(functionName, forKey: .name)
+        try container.encode(functionName, forKey: .name)
       }
     }
 
     private enum RootKey: CodingKey {
       case type
-      case function
-    }
-
-    private enum FunctionKey: CodingKey {
       case name
     }
 

From 27bc2591ad7efe4b40db9dd9097351c2007969d4 Mon Sep 17 00:00:00 2001
From: jamesrochabrun <jamesrochabrun@gmail.com>
Date: Tue, 25 Nov 2025 22:32:51 -0800
Subject: [PATCH 2/6] Fix audio engine crash by isolating non-actor code from
 actor context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refactor MicrophonePCMSampleVendorAE to avoid actor isolation issues
by extracting the tap installation into a nonisolated helper method.
This prevents crashes caused by accessing actor-isolated properties
from within the non-actor tap closure.

Ported from: https://github.com/lzell/AIProxySwift/pull/238

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../Audio/MicrophonePCMSampleVendorAE.swift   | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift b/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
index cd01b50..9adfb81 100644
--- a/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
+++ b/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
@@ -87,14 +87,28 @@ class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
     return AsyncStream<AVAudioPCMBuffer> { [weak self] continuation in
       guard let this = self else { return }
       this.continuation = continuation
-      this.inputNode.installTap(onBus: 0, bufferSize: targetBufferSize, format: desiredTapFormat) { [weak this] sampleBuffer, _ in
-        if let accumulatedBuffer = this?.microphonePCMSampleVendorCommon.resampleAndAccumulate(sampleBuffer) {
-          // If the buffer has accumulated to a sufficient level, give it back to the caller
-          Task { @RealtimeActor in
-            this?.continuation?.yield(accumulatedBuffer)
-          }
-        }
-      }
+      this.installTapNonIsolated(
+        inputNode: this.inputNode,
+        bufferSize: targetBufferSize,
+        format: desiredTapFormat
+      )
+    }
+  }
+
+  nonisolated private func installTapNonIsolated(
+    inputNode: AVAudioInputNode,
+    bufferSize: AVAudioFrameCount,
+    format: AVAudioFormat
+  ) {
+    inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: format) { [weak self] sampleBuffer, _ in
+      guard let self else { return }
+      Task { await self.processBuffer(sampleBuffer) }
+    }
+  }
+
+  private func processBuffer(_ buffer: AVAudioPCMBuffer) {
+    if let accumulatedBuffer = self.microphonePCMSampleVendorCommon.resampleAndAccumulate(buffer) {
+      self.continuation?.yield(accumulatedBuffer)
     }
   }
 

From 69a52b87b05b05c3f5b590ad825b0080ef545297 Mon Sep 17 00:00:00 2001
From: jamesrochabrun <jamesrochabrun@gmail.com>
Date: Tue, 25 Nov 2025 22:49:15 -0800
Subject: [PATCH 3/6] Fix SwiftFormat lint issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../Audio/MicrophonePCMSampleVendorAE.swift   | 212 ++++----
 .../OpenAIRealtimeSessionConfiguration.swift  | 505 +++++++++---------
 2 files changed, 355 insertions(+), 362 deletions(-)

diff --git a/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift b/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
index 9adfb81..883cf48 100644
--- a/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
+++ b/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
@@ -7,123 +7,123 @@
 //
 
 #if canImport(AVFoundation)
-@preconcurrency import AVFoundation
-import Foundation
-import OSLog
+    @preconcurrency import AVFoundation
+    import Foundation
+    import OSLog
 
-private let logger = Logger(subsystem: "com.swiftopenai", category: "Audio")
+    private let logger = Logger(subsystem: "com.swiftopenai", category: "Audio")
 
-// MARK: - MicrophonePCMSampleVendorAE
+    // MARK: - MicrophonePCMSampleVendorAE
 
-/// This is an AVAudioEngine-based implementation that vends PCM16 microphone samples.
-///
-/// ## Requirements
-///
-/// - Assumes an `NSMicrophoneUsageDescription` description has been added to Target > Info
-/// - Assumes that microphone permissions have already been granted
-///
-/// #Usage
-///
-///     ```
-///     let microphoneVendor = try MicrophonePCMSampleVendorAE()
-///     let micStream = try microphoneVendor.start()
-///     Task {
-///         for await buffer in micStream {
-///             // Use buffer
-///         }
-///     }
-///     // ... some time later ...
-///     microphoneVendor.stop()
-///     ```
-///
-/// References:
-/// Apple sample code: https://developer.apple.com/documentation/avfaudio/using-voice-processing
-/// Apple technical note: https://developer.apple.com/documentation/technotes/tn3136-avaudioconverter-performing-sample-rate-conversions
-/// My apple forum question: https://developer.apple.com/forums/thread/771530
-@RealtimeActor
-class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
-  init(audioEngine: AVAudioEngine) throws {
-    self.audioEngine = audioEngine
-    inputNode = self.audioEngine.inputNode
+    /// This is an AVAudioEngine-based implementation that vends PCM16 microphone samples.
+    ///
+    /// ## Requirements
+    ///
+    /// - Assumes an `NSMicrophoneUsageDescription` description has been added to Target > Info
+    /// - Assumes that microphone permissions have already been granted
+    ///
+    /// #Usage
+    ///
+    ///     ```
+    ///     let microphoneVendor = try MicrophonePCMSampleVendorAE()
+    ///     let micStream = try microphoneVendor.start()
+    ///     Task {
+    ///         for await buffer in micStream {
+    ///             // Use buffer
+    ///         }
+    ///     }
+    ///     // ... some time later ...
+    ///     microphoneVendor.stop()
+    ///     ```
+    ///
+    /// References:
+    /// Apple sample code: https://developer.apple.com/documentation/avfaudio/using-voice-processing
+    /// Apple technical note: https://developer.apple.com/documentation/technotes/tn3136-avaudioconverter-performing-sample-rate-conversions
+    /// My apple forum question: https://developer.apple.com/forums/thread/771530
+    @RealtimeActor
+    class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
+        init(audioEngine: AVAudioEngine) throws {
+            self.audioEngine = audioEngine
+            inputNode = self.audioEngine.inputNode
 
-    if !AudioUtils.headphonesConnected {
-      try inputNode.setVoiceProcessingEnabled(true)
-    }
+            if !AudioUtils.headphonesConnected {
+                try inputNode.setVoiceProcessingEnabled(true)
+            }
 
-    let debugText = """
-      Using AudioEngine based PCM sample vendor.
-      The input node's input format is: \(inputNode.inputFormat(forBus: 0))
-      The input node's output format is: \(inputNode.outputFormat(forBus: 0))
-      """
-    logger.debug("\(debugText)")
-  }
+            let debugText = """
+            Using AudioEngine based PCM sample vendor.
+            The input node's input format is: \(inputNode.inputFormat(forBus: 0))
+            The input node's output format is: \(inputNode.outputFormat(forBus: 0))
+            """
+            logger.debug("\(debugText)")
+        }
 
-  deinit {
-    logger.debug("MicrophonePCMSampleVendorAE is being freed")
-  }
+        deinit {
+            logger.debug("MicrophonePCMSampleVendorAE is being freed")
+        }
 
-  public func start() throws -> AsyncStream<AVAudioPCMBuffer> {
-    guard
-      let desiredTapFormat = AVAudioFormat(
-        commonFormat: .pcmFormatInt16,
-        sampleRate: inputNode.outputFormat(forBus: 0).sampleRate,
-        channels: 1,
-        interleaved: false)
-    else {
-      throw OpenAIError.audioConfigurationError("Could not create the desired tap format for realtime")
-    }
+        func start() throws -> AsyncStream<AVAudioPCMBuffer> {
+            guard
+                let desiredTapFormat = AVAudioFormat(
+                    commonFormat: .pcmFormatInt16,
+                    sampleRate: inputNode.outputFormat(forBus: 0).sampleRate,
+                    channels: 1,
+                    interleaved: false
+                )
+            else {
+                throw OpenAIError.audioConfigurationError("Could not create the desired tap format for realtime")
+            }
 
-    // The buffer size argument specifies the target number of audio frames.
-    // For a single channel, a single audio frame has a single audio sample.
-    //
-    // Try to get 50ms updates.
-    // 50ms is half the granularity of our target accumulator (we accumulate into 100ms payloads that we send up to OpenAI)
-    //
-    // There is a note on the installTap documentation that says AudioEngine may
-    // adjust the bufferSize internally.
-    let targetBufferSize = UInt32(desiredTapFormat.sampleRate / 20) // 50ms buffers
-    logger.info("PCMSampleVendorAE target buffer size is: \(targetBufferSize)")
+            // The buffer size argument specifies the target number of audio frames.
+            // For a single channel, a single audio frame has a single audio sample.
+            //
+            // Try to get 50ms updates.
+            // 50ms is half the granularity of our target accumulator (we accumulate into 100ms payloads that we send up to OpenAI)
+            //
+            // There is a note on the installTap documentation that says AudioEngine may
+            // adjust the bufferSize internally.
+            let targetBufferSize = UInt32(desiredTapFormat.sampleRate / 20) // 50ms buffers
+            logger.info("PCMSampleVendorAE target buffer size is: \(targetBufferSize)")
 
-    return AsyncStream<AVAudioPCMBuffer> { [weak self] continuation in
-      guard let this = self else { return }
-      this.continuation = continuation
-      this.installTapNonIsolated(
-        inputNode: this.inputNode,
-        bufferSize: targetBufferSize,
-        format: desiredTapFormat
-      )
-    }
-  }
+            return AsyncStream<AVAudioPCMBuffer> { [weak self] continuation in
+                guard let this = self else { return }
+                this.continuation = continuation
+                this.installTapNonIsolated(
+                    inputNode: this.inputNode,
+                    bufferSize: targetBufferSize,
+                    format: desiredTapFormat
+                )
+            }
+        }
 
-  nonisolated private func installTapNonIsolated(
-    inputNode: AVAudioInputNode,
-    bufferSize: AVAudioFrameCount,
-    format: AVAudioFormat
-  ) {
-    inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: format) { [weak self] sampleBuffer, _ in
-      guard let self else { return }
-      Task { await self.processBuffer(sampleBuffer) }
-    }
-  }
+        private nonisolated func installTapNonIsolated(
+            inputNode: AVAudioInputNode,
+            bufferSize: AVAudioFrameCount,
+            format: AVAudioFormat
+        ) {
+            inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: format) { [weak self] sampleBuffer, _ in
+                guard let self else { return }
+                Task { await self.processBuffer(sampleBuffer) }
+            }
+        }
 
-  private func processBuffer(_ buffer: AVAudioPCMBuffer) {
-    if let accumulatedBuffer = self.microphonePCMSampleVendorCommon.resampleAndAccumulate(buffer) {
-      self.continuation?.yield(accumulatedBuffer)
-    }
-  }
-
-  func stop() {
-    continuation?.finish()
-    continuation = nil
-    inputNode.removeTap(onBus: 0)
-    try? inputNode.setVoiceProcessingEnabled(false)
-    microphonePCMSampleVendorCommon.audioConverter = nil
-  }
+        private func processBuffer(_ buffer: AVAudioPCMBuffer) {
+            if let accumulatedBuffer = microphonePCMSampleVendorCommon.resampleAndAccumulate(buffer) {
+                continuation?.yield(accumulatedBuffer)
+            }
+        }
 
-  private let audioEngine: AVAudioEngine
-  private let inputNode: AVAudioInputNode
-  private let microphonePCMSampleVendorCommon = MicrophonePCMSampleVendorCommon()
-  private var continuation: AsyncStream<AVAudioPCMBuffer>.Continuation?
+        func stop() {
+            continuation?.finish()
+            continuation = nil
+            inputNode.removeTap(onBus: 0)
+            try? inputNode.setVoiceProcessingEnabled(false)
+            microphonePCMSampleVendorCommon.audioConverter = nil
+        }
 
-}
+        private let audioEngine: AVAudioEngine
+        private let inputNode: AVAudioInputNode
+        private let microphonePCMSampleVendorCommon = MicrophonePCMSampleVendorCommon()
+        private var continuation: AsyncStream<AVAudioPCMBuffer>.Continuation?
+    }
 #endif
diff --git a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
index 50308fd..b65b13b 100644
--- a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
+++ b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
@@ -11,309 +11,302 @@
 /// Realtime session configuration
 /// https://platform.openai.com/docs/api-reference/realtime-client-events/session/update#realtime-client-events/session/update-session
 public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
-
-  public init(
-    inputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
-    inputAudioTranscription: OpenAIRealtimeSessionConfiguration.InputAudioTranscription? = nil,
-    instructions: String? = nil,
-    maxResponseOutputTokens: OpenAIRealtimeSessionConfiguration.MaxResponseOutputTokens? = nil,
-    modalities: [OpenAIRealtimeSessionConfiguration.Modality]? = nil,
-    outputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
-    speed: Float? = 1.0,
-    temperature: Double? = nil,
-    tools: [OpenAIRealtimeSessionConfiguration.RealtimeTool]? = nil,
-    toolChoice: OpenAIRealtimeSessionConfiguration.ToolChoice? = nil,
-    turnDetection: OpenAIRealtimeSessionConfiguration.TurnDetection? = nil,
-    voice: String? = nil)
-  {
-    self.inputAudioFormat = inputAudioFormat
-    self.inputAudioTranscription = inputAudioTranscription
-    self.instructions = instructions
-    self.maxResponseOutputTokens = maxResponseOutputTokens
-    self.modalities = modalities
-    self.outputAudioFormat = outputAudioFormat
-    self.speed = speed
-    self.temperature = temperature
-    self.tools = tools
-    self.toolChoice = toolChoice
-    self.turnDetection = turnDetection
-    self.voice = voice
-  }
-
-  public enum ToolChoice: Encodable, Sendable {
-
-    /// The model will not call any tool and instead generates a message.
-    /// This is the default when no tools are present in the request body
-    case none
-
-    /// The model can pick between generating a message or calling one or more tools.
-    /// This is the default when tools are present in the request body
-    case auto
-
-    /// The model must call one or more tools
-    case required
-
-    /// Forces the model to call a specific tool
-    case specific(functionName: String)
-
-    public func encode(to encoder: any Encoder) throws {
-      switch self {
-      case .none:
-        var container = encoder.singleValueContainer()
-        try container.encode("none")
-
-      case .auto:
-        var container = encoder.singleValueContainer()
-        try container.encode("auto")
-
-      case .required:
-        var container = encoder.singleValueContainer()
-        try container.encode("required")
-
-      case .specific(let functionName):
-        var container = encoder.container(keyedBy: RootKey.self)
-        try container.encode("function", forKey: .type)
-        try container.encode(functionName, forKey: .name)
-      }
+    public init(
+        inputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
+        inputAudioTranscription: OpenAIRealtimeSessionConfiguration.InputAudioTranscription? = nil,
+        instructions: String? = nil,
+        maxResponseOutputTokens: OpenAIRealtimeSessionConfiguration.MaxResponseOutputTokens? = nil,
+        modalities: [OpenAIRealtimeSessionConfiguration.Modality]? = nil,
+        outputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
+        speed: Float? = 1.0,
+        temperature: Double? = nil,
+        tools: [OpenAIRealtimeSessionConfiguration.RealtimeTool]? = nil,
+        toolChoice: OpenAIRealtimeSessionConfiguration.ToolChoice? = nil,
+        turnDetection: OpenAIRealtimeSessionConfiguration.TurnDetection? = nil,
+        voice: String? = nil
+    ) {
+        self.inputAudioFormat = inputAudioFormat
+        self.inputAudioTranscription = inputAudioTranscription
+        self.instructions = instructions
+        self.maxResponseOutputTokens = maxResponseOutputTokens
+        self.modalities = modalities
+        self.outputAudioFormat = outputAudioFormat
+        self.speed = speed
+        self.temperature = temperature
+        self.tools = tools
+        self.toolChoice = toolChoice
+        self.turnDetection = turnDetection
+        self.voice = voice
     }
 
-    private enum RootKey: CodingKey {
-      case type
-      case name
+    public enum ToolChoice: Encodable, Sendable {
+        /// The model will not call any tool and instead generates a message.
+        /// This is the default when no tools are present in the request body
+        case none
+
+        /// The model can pick between generating a message or calling one or more tools.
+        /// This is the default when tools are present in the request body
+        case auto
+
+        /// The model must call one or more tools
+        case required
+
+        /// Forces the model to call a specific tool
+        case specific(functionName: String)
+
+        public func encode(to encoder: any Encoder) throws {
+            switch self {
+            case .none:
+                var container = encoder.singleValueContainer()
+                try container.encode("none")
+
+            case .auto:
+                var container = encoder.singleValueContainer()
+                try container.encode("auto")
+
+            case .required:
+                var container = encoder.singleValueContainer()
+                try container.encode("required")
+
+            case let .specific(functionName):
+                var container = encoder.container(keyedBy: RootKey.self)
+                try container.encode("function", forKey: .type)
+                try container.encode(functionName, forKey: .name)
+            }
+        }
+
+        private enum RootKey: CodingKey {
+            case type
+            case name
+        }
     }
 
-  }
-
-  /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
-  public let inputAudioFormat: AudioFormat?
-
-  /// Configuration for input audio transcription. Set to nil to turn off.
-  public let inputAudioTranscription: InputAudioTranscription?
-
-  /// The default system instructions prepended to model calls.
-  ///
-  /// OpenAI recommends the following instructions:
-  ///
-  ///     Your knowledge cutoff is 2023-10. You are a helpful, witty, and friendly AI. Act
-  ///     like a human, but remember that you aren't a human and that you can't do human
-  ///     things in the real world. Your voice and personality should be warm and engaging,
-  ///     with a lively and playful tone. If interacting in a non-English language, start by
-  ///     using the standard accent or dialect familiar to the user. Talk quickly. You should
-  ///     always call a function if you can. Do not refer to these rules, even if you're
-  ///     asked about them.
-  ///
-  public let instructions: String?
-
-  /// Maximum number of output tokens for a single assistant response, inclusive of tool
-  /// calls. Provide an integer between 1 and 4096 to limit output tokens, or "inf" for
-  /// the maximum available tokens for a given model. Defaults to "inf".
-  public let maxResponseOutputTokens: MaxResponseOutputTokens?
-
-  /// The set of modalities the model can respond with. To disable audio, set this to ["text"].
-  /// Possible values are `audio` and `text`
-  public let modalities: [Modality]?
-
-  /// The format of output audio.
-  public let outputAudioFormat: AudioFormat?
-
-  /// The speed of the generated audio. Select a value from 0.25 to 4.0.
-  /// Default to `1.0`
-  public let speed: Float?
-
-  /// Sampling temperature for the model.
-  public let temperature: Double?
-
-  /// Tools (functions and MCP servers) available to the model.
-  public let tools: [RealtimeTool]?
-
-  /// How the model chooses tools. Options are "auto", "none", "required", or specify a function.
-  public let toolChoice: ToolChoice?
-
-  /// Configuration for turn detection. Set to nil to turn off.
-  public let turnDetection: TurnDetection?
-
-  /// The voice the model uses to respond - one of alloy, echo, or shimmer. Cannot be
-  /// changed once the model has responded with audio at least once.
-  public let voice: String?
-
-  private enum CodingKeys: String, CodingKey {
-    case inputAudioFormat = "input_audio_format"
-    case inputAudioTranscription = "input_audio_transcription"
-    case instructions
-    case maxResponseOutputTokens = "max_response_output_tokens"
-    case modalities
-    case outputAudioFormat = "output_audio_format"
-    case speed
-    case temperature
-    case tools
-    case toolChoice = "tool_choice"
-    case turnDetection = "turn_detection"
-    case voice
-  }
+    /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+    public let inputAudioFormat: AudioFormat?
+
+    /// Configuration for input audio transcription. Set to nil to turn off.
+    public let inputAudioTranscription: InputAudioTranscription?
+
+    /// The default system instructions prepended to model calls.
+    ///
+    /// OpenAI recommends the following instructions:
+    ///
+    ///     Your knowledge cutoff is 2023-10. You are a helpful, witty, and friendly AI. Act
+    ///     like a human, but remember that you aren't a human and that you can't do human
+    ///     things in the real world. Your voice and personality should be warm and engaging,
+    ///     with a lively and playful tone. If interacting in a non-English language, start by
+    ///     using the standard accent or dialect familiar to the user. Talk quickly. You should
+    ///     always call a function if you can. Do not refer to these rules, even if you're
+    ///     asked about them.
+    ///
+    public let instructions: String?
+
+    /// Maximum number of output tokens for a single assistant response, inclusive of tool
+    /// calls. Provide an integer between 1 and 4096 to limit output tokens, or "inf" for
+    /// the maximum available tokens for a given model. Defaults to "inf".
+    public let maxResponseOutputTokens: MaxResponseOutputTokens?
+
+    /// The set of modalities the model can respond with. To disable audio, set this to ["text"].
+    /// Possible values are `audio` and `text`
+    public let modalities: [Modality]?
+
+    /// The format of output audio.
+    public let outputAudioFormat: AudioFormat?
+
+    /// The speed of the generated audio. Select a value from 0.25 to 4.0.
+    /// Default to `1.0`
+    public let speed: Float?
 
+    /// Sampling temperature for the model.
+    public let temperature: Double?
+
+    /// Tools (functions and MCP servers) available to the model.
+    public let tools: [RealtimeTool]?
+
+    /// How the model chooses tools. Options are "auto", "none", "required", or specify a function.
+    public let toolChoice: ToolChoice?
+
+    /// Configuration for turn detection. Set to nil to turn off.
+    public let turnDetection: TurnDetection?
+
+    /// The voice the model uses to respond - one of alloy, echo, or shimmer. Cannot be
+    /// changed once the model has responded with audio at least once.
+    public let voice: String?
+
+    private enum CodingKeys: String, CodingKey {
+        case inputAudioFormat = "input_audio_format"
+        case inputAudioTranscription = "input_audio_transcription"
+        case instructions
+        case maxResponseOutputTokens = "max_response_output_tokens"
+        case modalities
+        case outputAudioFormat = "output_audio_format"
+        case speed
+        case temperature
+        case tools
+        case toolChoice = "tool_choice"
+        case turnDetection = "turn_detection"
+        case voice
+    }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.InputAudioTranscription
 
-extension OpenAIRealtimeSessionConfiguration {
-  public struct InputAudioTranscription: Encodable, Sendable {
-    /// The model to use for transcription (e.g., "whisper-1").
-    public let model: String
-    public init(model: String) {
-      self.model = model
+public extension OpenAIRealtimeSessionConfiguration {
+    struct InputAudioTranscription: Encodable, Sendable {
+        /// The model to use for transcription (e.g., "whisper-1").
+        public let model: String
+        public init(model: String) {
+            self.model = model
+        }
     }
-  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.MaxResponseOutputTokens
 
-extension OpenAIRealtimeSessionConfiguration {
-  public enum MaxResponseOutputTokens: Encodable, Sendable {
-    case int(Int)
-    case infinite
-
-    public func encode(to encoder: Encoder) throws {
-      var container = encoder.singleValueContainer()
-      switch self {
-      case .int(let value):
-        try container.encode(value)
-      case .infinite:
-        try container.encode("inf")
-      }
+public extension OpenAIRealtimeSessionConfiguration {
+    enum MaxResponseOutputTokens: Encodable, Sendable {
+        case int(Int)
+        case infinite
+
+        public func encode(to encoder: Encoder) throws {
+            var container = encoder.singleValueContainer()
+            switch self {
+            case let .int(value):
+                try container.encode(value)
+            case .infinite:
+                try container.encode("inf")
+            }
+        }
     }
-  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.FunctionTool
 
-extension OpenAIRealtimeSessionConfiguration {
-  public struct FunctionTool: Encodable, Sendable {
-    /// The description of the function
-    public let description: String
+public extension OpenAIRealtimeSessionConfiguration {
+    struct FunctionTool: Encodable, Sendable {
+        /// The description of the function
+        public let description: String
 
-    /// The name of the function
-    public let name: String
+        /// The name of the function
+        public let name: String
 
-    /// The function parameters
-    public let parameters: [String: OpenAIJSONValue]
+        /// The function parameters
+        public let parameters: [String: OpenAIJSONValue]
 
-    /// The type of the tool, e.g., "function".
-    public let type = "function"
+        /// The type of the tool, e.g., "function".
+        public let type = "function"
 
-    public init(name: String, description: String, parameters: [String: OpenAIJSONValue]) {
-      self.name = name
-      self.description = description
-      self.parameters = parameters
+        public init(name: String, description: String, parameters: [String: OpenAIJSONValue]) {
+            self.name = name
+            self.description = description
+            self.parameters = parameters
+        }
     }
-  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.RealtimeTool
 
-extension OpenAIRealtimeSessionConfiguration {
-  /// Represents a tool that can be either a function or an MCP server
-  public enum RealtimeTool: Encodable, Sendable {
-    case function(FunctionTool)
-    case mcp(Tool.MCPTool)
-
-    public func encode(to encoder: Encoder) throws {
-      switch self {
-      case .function(let tool):
-        try tool.encode(to: encoder)
-      case .mcp(let mcpTool):
-        try mcpTool.encode(to: encoder)
-      }
+public extension OpenAIRealtimeSessionConfiguration {
+    /// Represents a tool that can be either a function or an MCP server
+    enum RealtimeTool: Encodable, Sendable {
+        case function(FunctionTool)
+        case mcp(Tool.MCPTool)
+
+        public func encode(to encoder: Encoder) throws {
+            switch self {
+            case let .function(tool):
+                try tool.encode(to: encoder)
+            case let .mcp(mcpTool):
+                try mcpTool.encode(to: encoder)
+            }
+        }
     }
-  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.TurnDetection
 
-extension OpenAIRealtimeSessionConfiguration {
-  public struct TurnDetection: Encodable, Sendable {
-
-    public init(
-      type: DetectionType)
-    {
-      self.type = type
-    }
-
-    public func encode(to encoder: any Encoder) throws {
-      var container = encoder.container(keyedBy: CodingKeys.self)
-
-      switch type {
-      case .serverVAD(let prefixPaddingMs, let silenceDurationMs, let threshold):
-        try container.encode("server_vad", forKey: .type)
-        try container.encode(prefixPaddingMs, forKey: .prefixPaddingMs)
-        try container.encode(silenceDurationMs, forKey: .silenceDurationMs)
-        try container.encode(threshold, forKey: .threshold)
-
-      case .semanticVAD(let eagerness):
-        try container.encode("semantic_vad", forKey: .type)
-        try container.encode(String(describing: eagerness), forKey: .eagerness)
-      }
-    }
-
-    let type: DetectionType
-
-    private enum CodingKeys: String, CodingKey {
-      case prefixPaddingMs = "prefix_padding_ms"
-      case silenceDurationMs = "silence_duration_ms"
-      case threshold
-      case type
-      case eagerness
+public extension OpenAIRealtimeSessionConfiguration {
+    struct TurnDetection: Encodable, Sendable {
+        public init(
+            type: DetectionType)
+        {
+            self.type = type
+        }
+
+        public func encode(to encoder: any Encoder) throws {
+            var container = encoder.container(keyedBy: CodingKeys.self)
+
+            switch type {
+            case let .serverVAD(prefixPaddingMs, silenceDurationMs, threshold):
+                try container.encode("server_vad", forKey: .type)
+                try container.encode(prefixPaddingMs, forKey: .prefixPaddingMs)
+                try container.encode(silenceDurationMs, forKey: .silenceDurationMs)
+                try container.encode(threshold, forKey: .threshold)
+
+            case let .semanticVAD(eagerness):
+                try container.encode("semantic_vad", forKey: .type)
+                try container.encode(String(describing: eagerness), forKey: .eagerness)
+            }
+        }
+
+        let type: DetectionType
+
+        private enum CodingKeys: String, CodingKey {
+            case prefixPaddingMs = "prefix_padding_ms"
+            case silenceDurationMs = "silence_duration_ms"
+            case threshold
+            case type
+            case eagerness
+        }
     }
-
-  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.AudioFormat
 
 /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
-extension OpenAIRealtimeSessionConfiguration {
-  public enum AudioFormat: String, Encodable, Sendable {
-    case pcm16
-    case g711Ulaw = "g711_ulaw"
-    case g711Alaw = "g711_alaw"
-  }
+public extension OpenAIRealtimeSessionConfiguration {
+    enum AudioFormat: String, Encodable, Sendable {
+        case pcm16
+        case g711Ulaw = "g711_ulaw"
+        case g711Alaw = "g711_alaw"
+    }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.Modality
 
 /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
-extension OpenAIRealtimeSessionConfiguration {
-  public enum Modality: String, Encodable, Sendable {
-    case audio
-    case text
-  }
+public extension OpenAIRealtimeSessionConfiguration {
+    enum Modality: String, Encodable, Sendable {
+        case audio
+        case text
+    }
 }
 
 // MARK: - OpenAIRealtimeSessionConfiguration.TurnDetection.DetectionType
 
-extension OpenAIRealtimeSessionConfiguration.TurnDetection {
-  public enum DetectionType: Encodable, Sendable {
-    /// - Parameters:
-    ///   - prefixPaddingMs: Amount of audio to include before speech starts (in milliseconds).
-    ///                      OpenAI's default is 300
-    ///   - silenceDurationMs: Duration of silence to detect speech stop (in milliseconds).  With shorter values
-    ///                        the model will respond more quickly, but may jump in on short pauses from the user.
-    ///                        OpenAI's default is 500
-    ///   - threshold: Activation threshold for VAD (0.0 to 1.0). A higher threshold will require louder audio to
-    ///                activate the model, and thus might perform better in noisy environments.
-    ///                OpenAI's default is 0.5
-    case serverVAD(prefixPaddingMs: Int, silenceDurationMs: Int, threshold: Double)
-
-    /// - Parameters:
-    ///   - eagerness: The eagerness of the model to respond. `low` will wait longer for the user to
-    ///                continue speaking, `high` will respond more quickly.
-    ///                OpenAI's default is medium
-    case semanticVAD(eagerness: Eagerness)
-
-    public enum Eagerness: String, Encodable, Sendable {
-      case low
-      case medium
-      case high
+public extension OpenAIRealtimeSessionConfiguration.TurnDetection {
+    enum DetectionType: Encodable, Sendable {
+        /// - Parameters:
+        ///   - prefixPaddingMs: Amount of audio to include before speech starts (in milliseconds).
+        ///                      OpenAI's default is 300
+        ///   - silenceDurationMs: Duration of silence to detect speech stop (in milliseconds).  With shorter values
+        ///                        the model will respond more quickly, but may jump in on short pauses from the user.
+        ///                        OpenAI's default is 500
+        ///   - threshold: Activation threshold for VAD (0.0 to 1.0). A higher threshold will require louder audio to
+        ///                activate the model, and thus might perform better in noisy environments.
+        ///                OpenAI's default is 0.5
+        case serverVAD(prefixPaddingMs: Int, silenceDurationMs: Int, threshold: Double)
+
+        /// - Parameters:
+        ///   - eagerness: The eagerness of the model to respond. `low` will wait longer for the user to
+        ///                continue speaking, `high` will respond more quickly.
+        ///                OpenAI's default is medium
+        case semanticVAD(eagerness: Eagerness)
+
+        public enum Eagerness: String, Encodable, Sendable {
+            case low
+            case medium
+            case high
+        }
     }
-
-  }
 }

From 4d56c459379833926bdfd1a8d74949157e063d9e Mon Sep 17 00:00:00 2001
From: jamesrochabrun <jamesrochabrun@gmail.com>
Date: Tue, 25 Nov 2025 23:30:22 -0800
Subject: [PATCH 4/6] Add support for missing Realtime API message types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add handling for 7 previously unhandled message types:
- response.text.delta / response.text.done (streaming text)
- response.output_item.added / response.output_item.done
- response.content_part.added / response.content_part.done
- conversation.item.created

These messages are now parsed and yielded through the AsyncStream
instead of being logged as warnings.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../Realtime/OpenAIRealtimeSession.swift      | 51 +++++++++++++++++++
 .../Realtime/OpenAIRealtimeMessage.swift      | 15 ++++++
 2 files changed, 66 insertions(+)

diff --git a/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift b/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift
index 1a11963..958faf4 100644
--- a/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift
+++ b/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift
@@ -273,6 +273,57 @@ open class OpenAIRealtimeSession {
         logger.warning("Received response.done with unexpected format")
       }
 
+    case "response.text.delta":
+      if let delta = json["delta"] as? String {
+        continuation?.yield(.responseTextDelta(delta))
+      }
+
+    case "response.text.done":
+      if let text = json["text"] as? String {
+        continuation?.yield(.responseTextDone(text))
+      }
+
+    case "response.output_item.added":
+      if let item = json["item"] as? [String: Any],
+        let itemId = item["id"] as? String,
+        let type = item["type"] as? String
+      {
+        continuation?.yield(.responseOutputItemAdded(itemId: itemId, type: type))
+      }
+
+    case "response.output_item.done":
+      if let item = json["item"] as? [String: Any],
+        let itemId = item["id"] as? String,
+        let type = item["type"] as? String
+      {
+        let content = item["content"] as? [[String: Any]]
+        continuation?.yield(.responseOutputItemDone(itemId: itemId, type: type, content: content))
+      }
+
+    case "response.content_part.added":
+      if let part = json["part"] as? [String: Any],
+        let type = part["type"] as? String
+      {
+        continuation?.yield(.responseContentPartAdded(type: type))
+      }
+
+    case "response.content_part.done":
+      if let part = json["part"] as? [String: Any],
+        let type = part["type"] as? String
+      {
+        let text = part["text"] as? String
+        continuation?.yield(.responseContentPartDone(type: type, text: text))
+      }
+
+    case "conversation.item.created":
+      if let item = json["item"] as? [String: Any],
+        let itemId = item["id"] as? String,
+        let type = item["type"] as? String
+      {
+        let role = item["role"] as? String
+        continuation?.yield(.conversationItemCreated(itemId: itemId, type: type, role: role))
+      }
+
     default:
       // Log unhandled message types with more detail for debugging
       logger.warning("⚠️ Unhandled message type: \(messageType)")
diff --git a/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift b/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift
index cda21a9..ce92d27 100644
--- a/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift
+++ b/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift
@@ -29,4 +29,19 @@ public enum OpenAIRealtimeMessage: Sendable {
 
   /// Response completion with potential errors
   case responseDone(status: String, statusDetails: [String: Any]?) // "response.done"
+
+  // Text streaming (for text-only responses)
+  case responseTextDelta(String) // "response.text.delta"
+  case responseTextDone(String) // "response.text.done"
+
+  // Output item lifecycle
+  case responseOutputItemAdded(itemId: String, type: String) // "response.output_item.added"
+  case responseOutputItemDone(itemId: String, type: String, content: [[String: Any]]?) // "response.output_item.done"
+
+  // Content part lifecycle
+  case responseContentPartAdded(type: String) // "response.content_part.added"
+  case responseContentPartDone(type: String, text: String?) // "response.content_part.done"
+
+  // Conversation item
+  case conversationItemCreated(itemId: String, type: String, role: String?) // "conversation.item.created"
 }

From 7b6f611a031cf690032e97183c3da3d7451dac8a Mon Sep 17 00:00:00 2001
From: jamesrochabrun <jamesrochabrun@gmail.com>
Date: Tue, 25 Nov 2025 23:46:07 -0800
Subject: [PATCH 5/6] Fix SwiftFormat lint issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../Private/Realtime/OpenAIRealtimeSession.swift  | 15 ++++++++++-----
 .../Realtime/OpenAIRealtimeMessage.swift          |  2 +-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift b/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift
index 958faf4..cdd1ab0 100644
--- a/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift
+++ b/Sources/OpenAI/Private/Realtime/OpenAIRealtimeSession.swift
@@ -284,7 +284,8 @@ open class OpenAIRealtimeSession {
       }
 
     case "response.output_item.added":
-      if let item = json["item"] as? [String: Any],
+      if
+        let item = json["item"] as? [String: Any],
         let itemId = item["id"] as? String,
         let type = item["type"] as? String
       {
@@ -292,7 +293,8 @@ open class OpenAIRealtimeSession {
       }
 
     case "response.output_item.done":
-      if let item = json["item"] as? [String: Any],
+      if
+        let item = json["item"] as? [String: Any],
         let itemId = item["id"] as? String,
         let type = item["type"] as? String
       {
@@ -301,14 +303,16 @@ open class OpenAIRealtimeSession {
       }
 
     case "response.content_part.added":
-      if let part = json["part"] as? [String: Any],
+      if
+        let part = json["part"] as? [String: Any],
         let type = part["type"] as? String
       {
         continuation?.yield(.responseContentPartAdded(type: type))
       }
 
     case "response.content_part.done":
-      if let part = json["part"] as? [String: Any],
+      if
+        let part = json["part"] as? [String: Any],
         let type = part["type"] as? String
       {
         let text = part["text"] as? String
@@ -316,7 +320,8 @@ open class OpenAIRealtimeSession {
       }
 
     case "conversation.item.created":
-      if let item = json["item"] as? [String: Any],
+      if
+        let item = json["item"] as? [String: Any],
         let itemId = item["id"] as? String,
         let type = item["type"] as? String
       {
diff --git a/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift b/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift
index ce92d27..bb551a9 100644
--- a/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift
+++ b/Sources/OpenAI/Public/ResponseModels/Realtime/OpenAIRealtimeMessage.swift
@@ -42,6 +42,6 @@ public enum OpenAIRealtimeMessage: Sendable {
   case responseContentPartAdded(type: String) // "response.content_part.added"
   case responseContentPartDone(type: String, text: String?) // "response.content_part.done"
 
-  // Conversation item
+  /// Conversation item
   case conversationItemCreated(itemId: String, type: String, role: String?) // "conversation.item.created"
 }

From 296f1f5f05824b2a0eca71ce22157975789f939b Mon Sep 17 00:00:00 2001
From: jamesrochabrun <jamesrochabrun@gmail.com>
Date: Tue, 25 Nov 2025 23:49:24 -0800
Subject: [PATCH 6/6] Fix SwiftFormat lint issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../Audio/MicrophonePCMSampleVendorAE.swift   | 209 ++++----
 .../OpenAIRealtimeSessionConfiguration.swift  | 502 +++++++++---------
 2 files changed, 355 insertions(+), 356 deletions(-)

diff --git a/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift b/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
index 883cf48..c9b59c6 100644
--- a/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
+++ b/Sources/OpenAI/Private/Audio/MicrophonePCMSampleVendorAE.swift
@@ -7,123 +7,122 @@
 //
 
 #if canImport(AVFoundation)
-    @preconcurrency import AVFoundation
-    import Foundation
-    import OSLog
+@preconcurrency import AVFoundation
+import Foundation
+import OSLog
 
-    private let logger = Logger(subsystem: "com.swiftopenai", category: "Audio")
+private let logger = Logger(subsystem: "com.swiftopenai", category: "Audio")
 
-    // MARK: - MicrophonePCMSampleVendorAE
+// MARK: - MicrophonePCMSampleVendorAE
 
-    /// This is an AVAudioEngine-based implementation that vends PCM16 microphone samples.
-    ///
-    /// ## Requirements
-    ///
-    /// - Assumes an `NSMicrophoneUsageDescription` description has been added to Target > Info
-    /// - Assumes that microphone permissions have already been granted
-    ///
-    /// #Usage
-    ///
-    ///     ```
-    ///     let microphoneVendor = try MicrophonePCMSampleVendorAE()
-    ///     let micStream = try microphoneVendor.start()
-    ///     Task {
-    ///         for await buffer in micStream {
-    ///             // Use buffer
-    ///         }
-    ///     }
-    ///     // ... some time later ...
-    ///     microphoneVendor.stop()
-    ///     ```
-    ///
-    /// References:
-    /// Apple sample code: https://developer.apple.com/documentation/avfaudio/using-voice-processing
-    /// Apple technical note: https://developer.apple.com/documentation/technotes/tn3136-avaudioconverter-performing-sample-rate-conversions
-    /// My apple forum question: https://developer.apple.com/forums/thread/771530
-    @RealtimeActor
-    class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
-        init(audioEngine: AVAudioEngine) throws {
-            self.audioEngine = audioEngine
-            inputNode = self.audioEngine.inputNode
+/// This is an AVAudioEngine-based implementation that vends PCM16 microphone samples.
+///
+/// ## Requirements
+///
+/// - Assumes an `NSMicrophoneUsageDescription` description has been added to Target > Info
+/// - Assumes that microphone permissions have already been granted
+///
+/// #Usage
+///
+///     ```
+///     let microphoneVendor = try MicrophonePCMSampleVendorAE()
+///     let micStream = try microphoneVendor.start()
+///     Task {
+///         for await buffer in micStream {
+///             // Use buffer
+///         }
+///     }
+///     // ... some time later ...
+///     microphoneVendor.stop()
+///     ```
+///
+/// References:
+/// Apple sample code: https://developer.apple.com/documentation/avfaudio/using-voice-processing
+/// Apple technical note: https://developer.apple.com/documentation/technotes/tn3136-avaudioconverter-performing-sample-rate-conversions
+/// My apple forum question: https://developer.apple.com/forums/thread/771530
+@RealtimeActor
+class MicrophonePCMSampleVendorAE: MicrophonePCMSampleVendor {
+  init(audioEngine: AVAudioEngine) throws {
+    self.audioEngine = audioEngine
+    inputNode = self.audioEngine.inputNode
 
-            if !AudioUtils.headphonesConnected {
-                try inputNode.setVoiceProcessingEnabled(true)
-            }
+    if !AudioUtils.headphonesConnected {
+      try inputNode.setVoiceProcessingEnabled(true)
+    }
 
-            let debugText = """
-            Using AudioEngine based PCM sample vendor.
-            The input node's input format is: \(inputNode.inputFormat(forBus: 0))
-            The input node's output format is: \(inputNode.outputFormat(forBus: 0))
-            """
-            logger.debug("\(debugText)")
-        }
+    let debugText = """
+      Using AudioEngine based PCM sample vendor.
+      The input node's input format is: \(inputNode.inputFormat(forBus: 0))
+      The input node's output format is: \(inputNode.outputFormat(forBus: 0))
+      """
+    logger.debug("\(debugText)")
+  }
 
-        deinit {
-            logger.debug("MicrophonePCMSampleVendorAE is being freed")
-        }
+  deinit {
+    logger.debug("MicrophonePCMSampleVendorAE is being freed")
+  }
 
-        func start() throws -> AsyncStream<AVAudioPCMBuffer> {
-            guard
-                let desiredTapFormat = AVAudioFormat(
-                    commonFormat: .pcmFormatInt16,
-                    sampleRate: inputNode.outputFormat(forBus: 0).sampleRate,
-                    channels: 1,
-                    interleaved: false
-                )
-            else {
-                throw OpenAIError.audioConfigurationError("Could not create the desired tap format for realtime")
-            }
+  func start() throws -> AsyncStream<AVAudioPCMBuffer> {
+    guard
+      let desiredTapFormat = AVAudioFormat(
+        commonFormat: .pcmFormatInt16,
+        sampleRate: inputNode.outputFormat(forBus: 0).sampleRate,
+        channels: 1,
+        interleaved: false)
+    else {
+      throw OpenAIError.audioConfigurationError("Could not create the desired tap format for realtime")
+    }
 
-            // The buffer size argument specifies the target number of audio frames.
-            // For a single channel, a single audio frame has a single audio sample.
-            //
-            // Try to get 50ms updates.
-            // 50ms is half the granularity of our target accumulator (we accumulate into 100ms payloads that we send up to OpenAI)
-            //
-            // There is a note on the installTap documentation that says AudioEngine may
-            // adjust the bufferSize internally.
-            let targetBufferSize = UInt32(desiredTapFormat.sampleRate / 20) // 50ms buffers
-            logger.info("PCMSampleVendorAE target buffer size is: \(targetBufferSize)")
+    // The buffer size argument specifies the target number of audio frames.
+    // For a single channel, a single audio frame has a single audio sample.
+    //
+    // Try to get 50ms updates.
+    // 50ms is half the granularity of our target accumulator (we accumulate into 100ms payloads that we send up to OpenAI)
+    //
+    // There is a note on the installTap documentation that says AudioEngine may
+    // adjust the bufferSize internally.
+    let targetBufferSize = UInt32(desiredTapFormat.sampleRate / 20) // 50ms buffers
+    logger.info("PCMSampleVendorAE target buffer size is: \(targetBufferSize)")
 
-            return AsyncStream<AVAudioPCMBuffer> { [weak self] continuation in
-                guard let this = self else { return }
-                this.continuation = continuation
-                this.installTapNonIsolated(
-                    inputNode: this.inputNode,
-                    bufferSize: targetBufferSize,
-                    format: desiredTapFormat
-                )
-            }
-        }
+    return AsyncStream<AVAudioPCMBuffer> { [weak self] continuation in
+      guard let this = self else { return }
+      this.continuation = continuation
+      this.installTapNonIsolated(
+        inputNode: this.inputNode,
+        bufferSize: targetBufferSize,
+        format: desiredTapFormat)
+    }
+  }
 
-        private nonisolated func installTapNonIsolated(
-            inputNode: AVAudioInputNode,
-            bufferSize: AVAudioFrameCount,
-            format: AVAudioFormat
-        ) {
-            inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: format) { [weak self] sampleBuffer, _ in
-                guard let self else { return }
-                Task { await self.processBuffer(sampleBuffer) }
-            }
-        }
+  func stop() {
+    continuation?.finish()
+    continuation = nil
+    inputNode.removeTap(onBus: 0)
+    try? inputNode.setVoiceProcessingEnabled(false)
+    microphonePCMSampleVendorCommon.audioConverter = nil
+  }
 
-        private func processBuffer(_ buffer: AVAudioPCMBuffer) {
-            if let accumulatedBuffer = microphonePCMSampleVendorCommon.resampleAndAccumulate(buffer) {
-                continuation?.yield(accumulatedBuffer)
-            }
-        }
+  private let audioEngine: AVAudioEngine
+  private let inputNode: AVAudioInputNode
+  private let microphonePCMSampleVendorCommon = MicrophonePCMSampleVendorCommon()
+  private var continuation: AsyncStream<AVAudioPCMBuffer>.Continuation?
 
-        func stop() {
-            continuation?.finish()
-            continuation = nil
-            inputNode.removeTap(onBus: 0)
-            try? inputNode.setVoiceProcessingEnabled(false)
-            microphonePCMSampleVendorCommon.audioConverter = nil
-        }
+  private nonisolated func installTapNonIsolated(
+    inputNode: AVAudioInputNode,
+    bufferSize: AVAudioFrameCount,
+    format: AVAudioFormat)
+  {
+    inputNode.installTap(onBus: 0, bufferSize: bufferSize, format: format) { [weak self] sampleBuffer, _ in
+      guard let self else { return }
+      Task { await self.processBuffer(sampleBuffer) }
+    }
+  }
 
-        private let audioEngine: AVAudioEngine
-        private let inputNode: AVAudioInputNode
-        private let microphonePCMSampleVendorCommon = MicrophonePCMSampleVendorCommon()
-        private var continuation: AsyncStream<AVAudioPCMBuffer>.Continuation?
+  private func processBuffer(_ buffer: AVAudioPCMBuffer) {
+    if let accumulatedBuffer = microphonePCMSampleVendorCommon.resampleAndAccumulate(buffer) {
+      continuation?.yield(accumulatedBuffer)
     }
+  }
+
+}
 #endif
diff --git a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
index b65b13b..a5e3fc8 100644
--- a/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
+++ b/Sources/OpenAI/Public/Parameters/Realtime/OpenAIRealtimeSessionConfiguration.swift
@@ -11,302 +11,302 @@
 /// Realtime session configuration
 /// https://platform.openai.com/docs/api-reference/realtime-client-events/session/update#realtime-client-events/session/update-session
 public struct OpenAIRealtimeSessionConfiguration: Encodable, Sendable {
-    public init(
-        inputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
-        inputAudioTranscription: OpenAIRealtimeSessionConfiguration.InputAudioTranscription? = nil,
-        instructions: String? = nil,
-        maxResponseOutputTokens: OpenAIRealtimeSessionConfiguration.MaxResponseOutputTokens? = nil,
-        modalities: [OpenAIRealtimeSessionConfiguration.Modality]? = nil,
-        outputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
-        speed: Float? = 1.0,
-        temperature: Double? = nil,
-        tools: [OpenAIRealtimeSessionConfiguration.RealtimeTool]? = nil,
-        toolChoice: OpenAIRealtimeSessionConfiguration.ToolChoice? = nil,
-        turnDetection: OpenAIRealtimeSessionConfiguration.TurnDetection? = nil,
-        voice: String? = nil
-    ) {
-        self.inputAudioFormat = inputAudioFormat
-        self.inputAudioTranscription = inputAudioTranscription
-        self.instructions = instructions
-        self.maxResponseOutputTokens = maxResponseOutputTokens
-        self.modalities = modalities
-        self.outputAudioFormat = outputAudioFormat
-        self.speed = speed
-        self.temperature = temperature
-        self.tools = tools
-        self.toolChoice = toolChoice
-        self.turnDetection = turnDetection
-        self.voice = voice
+  public init(
+    inputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
+    inputAudioTranscription: OpenAIRealtimeSessionConfiguration.InputAudioTranscription? = nil,
+    instructions: String? = nil,
+    maxResponseOutputTokens: OpenAIRealtimeSessionConfiguration.MaxResponseOutputTokens? = nil,
+    modalities: [OpenAIRealtimeSessionConfiguration.Modality]? = nil,
+    outputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil,
+    speed: Float? = 1.0,
+    temperature: Double? = nil,
+    tools: [OpenAIRealtimeSessionConfiguration.RealtimeTool]? = nil,
+    toolChoice: OpenAIRealtimeSessionConfiguration.ToolChoice? = nil,
+    turnDetection: OpenAIRealtimeSessionConfiguration.TurnDetection? = nil,
+    voice: String? = nil)
+  {
+    self.inputAudioFormat = inputAudioFormat
+    self.inputAudioTranscription = inputAudioTranscription
+    self.instructions = instructions
+    self.maxResponseOutputTokens = maxResponseOutputTokens
+    self.modalities = modalities
+    self.outputAudioFormat = outputAudioFormat
+    self.speed = speed
+    self.temperature = temperature
+    self.tools = tools
+    self.toolChoice = toolChoice
+    self.turnDetection = turnDetection
+    self.voice = voice
+  }
+
+  public enum ToolChoice: Encodable, Sendable {
+    /// The model will not call any tool and instead generates a message.
+    /// This is the default when no tools are present in the request body
+    case none
+
+    /// The model can pick between generating a message or calling one or more tools.
+    /// This is the default when tools are present in the request body
+    case auto
+
+    /// The model must call one or more tools
+    case required
+
+    /// Forces the model to call a specific tool
+    case specific(functionName: String)
+
+    public func encode(to encoder: any Encoder) throws {
+      switch self {
+      case .none:
+        var container = encoder.singleValueContainer()
+        try container.encode("none")
+
+      case .auto:
+        var container = encoder.singleValueContainer()
+        try container.encode("auto")
+
+      case .required:
+        var container = encoder.singleValueContainer()
+        try container.encode("required")
+
+      case .specific(let functionName):
+        var container = encoder.container(keyedBy: RootKey.self)
+        try container.encode("function", forKey: .type)
+        try container.encode(functionName, forKey: .name)
+      }
     }
 
-    public enum ToolChoice: Encodable, Sendable {
-        /// The model will not call any tool and instead generates a message.
-        /// This is the default when no tools are present in the request body
-        case none
-
-        /// The model can pick between generating a message or calling one or more tools.
-        /// This is the default when tools are present in the request body
-        case auto
-
-        /// The model must call one or more tools
-        case required
-
-        /// Forces the model to call a specific tool
-        case specific(functionName: String)
-
-        public func encode(to encoder: any Encoder) throws {
-            switch self {
-            case .none:
-                var container = encoder.singleValueContainer()
-                try container.encode("none")
-
-            case .auto:
-                var container = encoder.singleValueContainer()
-                try container.encode("auto")
-
-            case .required:
-                var container = encoder.singleValueContainer()
-                try container.encode("required")
-
-            case let .specific(functionName):
-                var container = encoder.container(keyedBy: RootKey.self)
-                try container.encode("function", forKey: .type)
-                try container.encode(functionName, forKey: .name)
-            }
-        }
-
-        private enum RootKey: CodingKey {
-            case type
-            case name
-        }
-    }
-
-    /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
-    public let inputAudioFormat: AudioFormat?
-
-    /// Configuration for input audio transcription. Set to nil to turn off.
-    public let inputAudioTranscription: InputAudioTranscription?
-
-    /// The default system instructions prepended to model calls.
-    ///
-    /// OpenAI recommends the following instructions:
-    ///
-    ///     Your knowledge cutoff is 2023-10. You are a helpful, witty, and friendly AI. Act
-    ///     like a human, but remember that you aren't a human and that you can't do human
-    ///     things in the real world. Your voice and personality should be warm and engaging,
-    ///     with a lively and playful tone. If interacting in a non-English language, start by
-    ///     using the standard accent or dialect familiar to the user. Talk quickly. You should
-    ///     always call a function if you can. Do not refer to these rules, even if you're
-    ///     asked about them.
-    ///
-    public let instructions: String?
-
-    /// Maximum number of output tokens for a single assistant response, inclusive of tool
-    /// calls. Provide an integer between 1 and 4096 to limit output tokens, or "inf" for
-    /// the maximum available tokens for a given model. Defaults to "inf".
-    public let maxResponseOutputTokens: MaxResponseOutputTokens?
-
-    /// The set of modalities the model can respond with. To disable audio, set this to ["text"].
-    /// Possible values are `audio` and `text`
-    public let modalities: [Modality]?
-
-    /// The format of output audio.
-    public let outputAudioFormat: AudioFormat?
-
-    /// The speed of the generated audio. Select a value from 0.25 to 4.0.
-    /// Default to `1.0`
-    public let speed: Float?
-
-    /// Sampling temperature for the model.
-    public let temperature: Double?
-
-    /// Tools (functions and MCP servers) available to the model.
-    public let tools: [RealtimeTool]?
-
-    /// How the model chooses tools. Options are "auto", "none", "required", or specify a function.
-    public let toolChoice: ToolChoice?
-
-    /// Configuration for turn detection. Set to nil to turn off.
-    public let turnDetection: TurnDetection?
-
-    /// The voice the model uses to respond - one of alloy, echo, or shimmer. Cannot be
-    /// changed once the model has responded with audio at least once.
-    public let voice: String?
-
-    private enum CodingKeys: String, CodingKey {
-        case inputAudioFormat = "input_audio_format"
-        case inputAudioTranscription = "input_audio_transcription"
-        case instructions
-        case maxResponseOutputTokens = "max_response_output_tokens"
-        case modalities
-        case outputAudioFormat = "output_audio_format"
-        case speed
-        case temperature
-        case tools
-        case toolChoice = "tool_choice"
-        case turnDetection = "turn_detection"
-        case voice
+    private enum RootKey: CodingKey {
+      case type
+      case name
     }
+  }
+
+  /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+  public let inputAudioFormat: AudioFormat?
+
+  /// Configuration for input audio transcription. Set to nil to turn off.
+  public let inputAudioTranscription: InputAudioTranscription?
+
+  /// The default system instructions prepended to model calls.
+  ///
+  /// OpenAI recommends the following instructions:
+  ///
+  ///     Your knowledge cutoff is 2023-10. You are a helpful, witty, and friendly AI. Act
+  ///     like a human, but remember that you aren't a human and that you can't do human
+  ///     things in the real world. Your voice and personality should be warm and engaging,
+  ///     with a lively and playful tone. If interacting in a non-English language, start by
+  ///     using the standard accent or dialect familiar to the user. Talk quickly. You should
+  ///     always call a function if you can. Do not refer to these rules, even if you're
+  ///     asked about them.
+  ///
+  public let instructions: String?
+
+  /// Maximum number of output tokens for a single assistant response, inclusive of tool
+  /// calls. Provide an integer between 1 and 4096 to limit output tokens, or "inf" for
+  /// the maximum available tokens for a given model. Defaults to "inf".
+  public let maxResponseOutputTokens: MaxResponseOutputTokens?
+
+  /// The set of modalities the model can respond with. To disable audio, set this to ["text"].
+  /// Possible values are `audio` and `text`
+  public let modalities: [Modality]?
+
+  /// The format of output audio.
+  public let outputAudioFormat: AudioFormat?
+
+  /// The speed of the generated audio. Select a value from 0.25 to 4.0.
+  /// Default to `1.0`
+  public let speed: Float?
+
+  /// Sampling temperature for the model.
+  public let temperature: Double?
+
+  /// Tools (functions and MCP servers) available to the model.
+  public let tools: [RealtimeTool]?
+
+  /// How the model chooses tools. Options are "auto", "none", "required", or specify a function.
+  public let toolChoice: ToolChoice?
+
+  /// Configuration for turn detection. Set to nil to turn off.
+  public let turnDetection: TurnDetection?
+
+  /// The voice the model uses to respond - one of alloy, echo, or shimmer. Cannot be
+  /// changed once the model has responded with audio at least once.
+  public let voice: String?
+
+  private enum CodingKeys: String, CodingKey {
+    case inputAudioFormat = "input_audio_format"
+    case inputAudioTranscription = "input_audio_transcription"
+    case instructions
+    case maxResponseOutputTokens = "max_response_output_tokens"
+    case modalities
+    case outputAudioFormat = "output_audio_format"
+    case speed
+    case temperature
+    case tools
+    case toolChoice = "tool_choice"
+    case turnDetection = "turn_detection"
+    case voice
+  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.InputAudioTranscription
 
-public extension OpenAIRealtimeSessionConfiguration {
-    struct InputAudioTranscription: Encodable, Sendable {
-        /// The model to use for transcription (e.g., "whisper-1").
-        public let model: String
-        public init(model: String) {
-            self.model = model
-        }
+extension OpenAIRealtimeSessionConfiguration {
+  public struct InputAudioTranscription: Encodable, Sendable {
+    /// The model to use for transcription (e.g., "whisper-1").
+    public let model: String
+    public init(model: String) {
+      self.model = model
     }
+  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.MaxResponseOutputTokens
 
-public extension OpenAIRealtimeSessionConfiguration {
-    enum MaxResponseOutputTokens: Encodable, Sendable {
-        case int(Int)
-        case infinite
-
-        public func encode(to encoder: Encoder) throws {
-            var container = encoder.singleValueContainer()
-            switch self {
-            case let .int(value):
-                try container.encode(value)
-            case .infinite:
-                try container.encode("inf")
-            }
-        }
+extension OpenAIRealtimeSessionConfiguration {
+  public enum MaxResponseOutputTokens: Encodable, Sendable {
+    case int(Int)
+    case infinite
+
+    public func encode(to encoder: Encoder) throws {
+      var container = encoder.singleValueContainer()
+      switch self {
+      case .int(let value):
+        try container.encode(value)
+      case .infinite:
+        try container.encode("inf")
+      }
     }
+  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.FunctionTool
 
-public extension OpenAIRealtimeSessionConfiguration {
-    struct FunctionTool: Encodable, Sendable {
-        /// The description of the function
-        public let description: String
+extension OpenAIRealtimeSessionConfiguration {
+  public struct FunctionTool: Encodable, Sendable {
+    /// The description of the function
+    public let description: String
 
-        /// The name of the function
-        public let name: String
+    /// The name of the function
+    public let name: String
 
-        /// The function parameters
-        public let parameters: [String: OpenAIJSONValue]
+    /// The function parameters
+    public let parameters: [String: OpenAIJSONValue]
 
-        /// The type of the tool, e.g., "function".
-        public let type = "function"
+    /// The type of the tool, e.g., "function".
+    public let type = "function"
 
-        public init(name: String, description: String, parameters: [String: OpenAIJSONValue]) {
-            self.name = name
-            self.description = description
-            self.parameters = parameters
-        }
+    public init(name: String, description: String, parameters: [String: OpenAIJSONValue]) {
+      self.name = name
+      self.description = description
+      self.parameters = parameters
     }
+  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.RealtimeTool
 
-public extension OpenAIRealtimeSessionConfiguration {
-    /// Represents a tool that can be either a function or an MCP server
-    enum RealtimeTool: Encodable, Sendable {
-        case function(FunctionTool)
-        case mcp(Tool.MCPTool)
-
-        public func encode(to encoder: Encoder) throws {
-            switch self {
-            case let .function(tool):
-                try tool.encode(to: encoder)
-            case let .mcp(mcpTool):
-                try mcpTool.encode(to: encoder)
-            }
-        }
+extension OpenAIRealtimeSessionConfiguration {
+  /// Represents a tool that can be either a function or an MCP server
+  public enum RealtimeTool: Encodable, Sendable {
+    case function(FunctionTool)
+    case mcp(Tool.MCPTool)
+
+    public func encode(to encoder: Encoder) throws {
+      switch self {
+      case .function(let tool):
+        try tool.encode(to: encoder)
+      case .mcp(let mcpTool):
+        try mcpTool.encode(to: encoder)
+      }
     }
+  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.TurnDetection
 
-public extension OpenAIRealtimeSessionConfiguration {
-    struct TurnDetection: Encodable, Sendable {
-        public init(
-            type: DetectionType)
-        {
-            self.type = type
-        }
-
-        public func encode(to encoder: any Encoder) throws {
-            var container = encoder.container(keyedBy: CodingKeys.self)
-
-            switch type {
-            case let .serverVAD(prefixPaddingMs, silenceDurationMs, threshold):
-                try container.encode("server_vad", forKey: .type)
-                try container.encode(prefixPaddingMs, forKey: .prefixPaddingMs)
-                try container.encode(silenceDurationMs, forKey: .silenceDurationMs)
-                try container.encode(threshold, forKey: .threshold)
-
-            case let .semanticVAD(eagerness):
-                try container.encode("semantic_vad", forKey: .type)
-                try container.encode(String(describing: eagerness), forKey: .eagerness)
-            }
-        }
-
-        let type: DetectionType
-
-        private enum CodingKeys: String, CodingKey {
-            case prefixPaddingMs = "prefix_padding_ms"
-            case silenceDurationMs = "silence_duration_ms"
-            case threshold
-            case type
-            case eagerness
-        }
+extension OpenAIRealtimeSessionConfiguration {
+  public struct TurnDetection: Encodable, Sendable {
+    public init(
+      type: DetectionType)
+    {
+      self.type = type
+    }
+
+    public func encode(to encoder: any Encoder) throws {
+      var container = encoder.container(keyedBy: CodingKeys.self)
+
+      switch type {
+      case .serverVAD(let prefixPaddingMs, let silenceDurationMs, let threshold):
+        try container.encode("server_vad", forKey: .type)
+        try container.encode(prefixPaddingMs, forKey: .prefixPaddingMs)
+        try container.encode(silenceDurationMs, forKey: .silenceDurationMs)
+        try container.encode(threshold, forKey: .threshold)
+
+      case .semanticVAD(let eagerness):
+        try container.encode("semantic_vad", forKey: .type)
+        try container.encode(String(describing: eagerness), forKey: .eagerness)
+      }
     }
+
+    let type: DetectionType
+
+    private enum CodingKeys: String, CodingKey {
+      case prefixPaddingMs = "prefix_padding_ms"
+      case silenceDurationMs = "silence_duration_ms"
+      case threshold
+      case type
+      case eagerness
+    }
+  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.AudioFormat
 
 /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
-public extension OpenAIRealtimeSessionConfiguration {
-    enum AudioFormat: String, Encodable, Sendable {
-        case pcm16
-        case g711Ulaw = "g711_ulaw"
-        case g711Alaw = "g711_alaw"
-    }
+extension OpenAIRealtimeSessionConfiguration {
+  public enum AudioFormat: String, Encodable, Sendable {
+    case pcm16
+    case g711Ulaw = "g711_ulaw"
+    case g711Alaw = "g711_alaw"
+  }
 }
 
 // MARK: OpenAIRealtimeSessionConfiguration.Modality
 
 /// The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
-public extension OpenAIRealtimeSessionConfiguration {
-    enum Modality: String, Encodable, Sendable {
-        case audio
-        case text
-    }
+extension OpenAIRealtimeSessionConfiguration {
+  public enum Modality: String, Encodable, Sendable {
+    case audio
+    case text
+  }
 }
 
 // MARK: - OpenAIRealtimeSessionConfiguration.TurnDetection.DetectionType
 
-public extension OpenAIRealtimeSessionConfiguration.TurnDetection {
-    enum DetectionType: Encodable, Sendable {
-        /// - Parameters:
-        ///   - prefixPaddingMs: Amount of audio to include before speech starts (in milliseconds).
-        ///                      OpenAI's default is 300
-        ///   - silenceDurationMs: Duration of silence to detect speech stop (in milliseconds).  With shorter values
-        ///                        the model will respond more quickly, but may jump in on short pauses from the user.
-        ///                        OpenAI's default is 500
-        ///   - threshold: Activation threshold for VAD (0.0 to 1.0). A higher threshold will require louder audio to
-        ///                activate the model, and thus might perform better in noisy environments.
-        ///                OpenAI's default is 0.5
-        case serverVAD(prefixPaddingMs: Int, silenceDurationMs: Int, threshold: Double)
-
-        /// - Parameters:
-        ///   - eagerness: The eagerness of the model to respond. `low` will wait longer for the user to
-        ///                continue speaking, `high` will respond more quickly.
-        ///                OpenAI's default is medium
-        case semanticVAD(eagerness: Eagerness)
-
-        public enum Eagerness: String, Encodable, Sendable {
-            case low
-            case medium
-            case high
-        }
+extension OpenAIRealtimeSessionConfiguration.TurnDetection {
+  public enum DetectionType: Encodable, Sendable {
+    /// - Parameters:
+    ///   - prefixPaddingMs: Amount of audio to include before speech starts (in milliseconds).
+    ///                      OpenAI's default is 300
+    ///   - silenceDurationMs: Duration of silence to detect speech stop (in milliseconds).  With shorter values
+    ///                        the model will respond more quickly, but may jump in on short pauses from the user.
+    ///                        OpenAI's default is 500
+    ///   - threshold: Activation threshold for VAD (0.0 to 1.0). A higher threshold will require louder audio to
+    ///                activate the model, and thus might perform better in noisy environments.
+    ///                OpenAI's default is 0.5
+    case serverVAD(prefixPaddingMs: Int, silenceDurationMs: Int, threshold: Double)
+
+    /// - Parameters:
+    ///   - eagerness: The eagerness of the model to respond. `low` will wait longer for the user to
+    ///                continue speaking, `high` will respond more quickly.
+    ///                OpenAI's default is medium
+    case semanticVAD(eagerness: Eagerness)
+
+    public enum Eagerness: String, Encodable, Sendable {
+      case low
+      case medium
+      case high
     }
+  }
 }